## Домашнее задание `максимум 25 баллов (бывшее 10)`

## Критерии оценки 
`❗️Оцениваться будет значение метрики + ревью кода + реализация сервиса.` 

Вы можете сделать НЕ ВСЕ пункты и все равно получить 25 баллов. Получение > 25 баллов будет расцениваться как 25.


### 1. Побейте метрику на лидерборде `map@10 = 0.063` для userKnn модели с семинара (`4 балла`)


### 2. Предоставьте ноутбук(и) с экспериментами (`16 баллов`)

Что можно сделать:
   - сделать кол-во рекомендаций не меньше N (`2 балла`)
   - наличие тюнинга гиперпараметров (например, векторного расстояния или типов kNN моделей (implicit/rectools/...)) (`4 балла`)
   - другие варианты ранжирования айтемов похожих пользователей (`2 балла`)
   - эксперименты с оффлайн валидацией (`2 балла`)
   - в тесте вас ждут холодные пользователи. Сделайте рекомендации для них (обратите внимание на <a href="https://rectools.readthedocs.io/en/latest/api/rectools.models.popular.html"> rectools.models.popular</a>) (`2 балла`)
   - блендинг моделей (`4 балла`)


### 3. Оберните модель в сервис.
- **предпочтительный онлайн вариант**: обучаете модель в ноутбуке, сохраняете обученную модель (pickle, dill), при запуске сервиса ее поднимаете и запрашиваете рекомендации "на лету" (`9 баллов`)
- или оффлайн вариант: предварительно посчитайте рекомендации для всех пользователей, сохраните и запрашивайте их (`4 балла`)
   

### Хороший код ДЗ это:
- комментарии и объяснения. В ipynb пользуйтесь силой маркдауна. 
В скриптах пишите комментарии и докстринг. 
- легкая читаемость и воспроизводимость
- стандарт PEP8 
- обоснование схемы валидации
- анализ метрики качества 

## Imports

In [1]:
from pathlib import Path
from collections import Counter
import optuna
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel, PopularModel
from implicit.nearest_neighbours import CosineRecommender, BM25Recommender, TFIDFRecommender
from rectools.models.implicit_knn import ImplicitItemKNNWrapperModel
from rectools.dataset import Dataset
from rectools import Columns
from rectools.metrics import Precision, Recall, MAP, calc_metrics, MeanInvUserFreq, Serendipity
from rectools.model_selection import TimeRangeSplitter

import dill
import json
from pprint import pprint
import numpy as np
import scipy as sp
import time
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

## Поднимаем данные

In [2]:
DATA_PATH = Path("data_original")

In [3]:
users = pd.read_csv(DATA_PATH / 'users.csv')
items = pd.read_csv(DATA_PATH / 'items.csv')
interactions = pd.read_csv(DATA_PATH / 'interactions.csv')

In [4]:
# Rename the raw columns to the names held in `Columns.Datetime` and
# `Columns.Weight`, in place.
interactions.rename(columns={'last_watch_dt': Columns.Datetime,
                            'total_dur': Columns.Weight}, 
                    inplace=True) 

# Parse the timestamp column. The literal 'datetime' is the column name that the
# rename above produces (see the recorded `head()` output of the next cell).
interactions['datetime'] = pd.to_datetime(interactions['datetime'])

In [5]:
interactions.head()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


In [6]:
print(f"Interactions dataframe shape: {interactions.shape}")
print(f"Unique users in interactions: {interactions['user_id'].nunique():_}")
print(f"Unique items in interactions: {interactions['item_id'].nunique():_}")

Interactions dataframe shape: (5476251, 5)
Unique users in interactions: 962_179
Unique items in interactions: 15_706


In [7]:
max_date = interactions[Columns.Datetime].max()
min_date = interactions[Columns.Datetime].min()

print(f"min date in interactions: {min_date}")
print(f"max date in interactions: {max_date}")

min date in interactions: 2021-03-13 00:00:00
max date in interactions: 2021-08-22 00:00:00


## Разбиваем на train/test

In [8]:
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=None,
    item_features_df=None
)

In [9]:
# Cross-validation layout: `n_folds` test folds, each `n_units` weeks long.
n_folds = 1
unit = "W"
n_units = 1
# `periods` fold borders are generated below.
periods = n_folds + 1
freq = f"{n_units}{unit}"

# Latest interaction date, truncated to midnight.
last_date = interactions[Columns.Datetime].max().normalize()
# Go back (n_folds * n_units + 1) weeks from the last date.
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)  
print(f"Start date and last date of the test fold: {start_date, last_date}")
    
# NOTE(review): in the recorded output the borders are 2021-08-08 and 2021-08-15
# while the last date is 2021-08-22, so the final week of data appears to fall
# outside the test fold — confirm this is intended.
date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# generator of folds
cv = TimeRangeSplitter(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv.get_n_splits(dataset.interactions)}")


Start date and last date of the test fold: (Timestamp('2021-08-08 00:00:00'), Timestamp('2021-08-22 00:00:00'))
Test fold borders: ['2021-08-08' '2021-08-15']
Real number of folds: 1


In [10]:
# There is just one test fold, so take the first item of the split generator
# with the built-in next() instead of calling the dunder method directly.
(train_ids, test_ids, fold_info) = next(cv.split(dataset.interactions, collect_fold_stats=True))

In [11]:
train_ids

array([      0,       1,       2, ..., 5476245, 5476247, 5476249])

In [12]:
train = interactions.loc[train_ids]
test = interactions.loc[test_ids]

## Обучение модели PopularModel

In [None]:
pop = PopularModel()
pop.fit(dataset)

## Сравнение с ItemKnn

In [13]:
item_knn = ImplicitItemKNNWrapperModel(model=CosineRecommender(K=30))
item_knn.fit(dataset);

In [14]:
# Take a look at the items recommended by the simple item-kNN model.
# NOTE(review): `item_knn` was fitted on `dataset`, which was constructed from the
# full `interactions` frame, so the test rows were seen during fitting.
recs_itemknn = item_knn.recommend(
    test['user_id'].unique(), 
    dataset=dataset, 
    k=10, 
    filter_viewed=False  # NOTE(review): presumably keeps already-watched items in the output — confirm in the RecTools docs
)

In [15]:
recs_itemknn.head()

Unnamed: 0,user_id,item_id,score,rank
0,1016458,10440,20449.246972,1
1,1016458,4038,14580.000128,2
2,1016458,12192,8147.508352,3
3,1016458,1986,8044.704301,4
4,1016458,734,8043.999968,5


## Обучение userKnn с различными мерами расстояния

### Подготовка данных

In [16]:
max_date = interactions[Columns.Datetime].max()

In [17]:
# Overwrite the weight column: 3 when `watched_pct` is above 10, otherwise 1
# (rows where the comparison is False, e.g. a missing `watched_pct`, also get 1).
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

In [18]:
# Hold out the last 7 days (counted back from the latest timestamp) as test;
# everything strictly earlier becomes train.
split_date = max_date - pd.Timedelta(days=7)
train = interactions.loc[interactions[Columns.Datetime] < split_date].copy()
test = interactions.loc[interactions[Columns.Datetime] >= split_date].copy()

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (4985269, 5)
test: (490982, 5)


In [19]:
# Collect cold users: ids that occur in test but never in train.
cold_users = set(test[Columns.User]) - set(train[Columns.User])

In [20]:
# Remove every test interaction that belongs to a cold user (in place).
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

### Подготовка фичей

#### Пользовательские фичи

In [21]:
# The share of missing values is small, so replace them with a placeholder.
# Note that this fills every column of `users` with the string 'Unknown'.
users.fillna('Unknown', inplace=True)

In [22]:
# Keep only the users that occur in train.
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()

In [23]:
users

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0
5,1037719,age_45_54,income_60_90,М,0
...,...,...,...,...,...
840188,312839,age_65_inf,income_60_90,Ж,0
840189,191349,age_45_54,income_40_60,М,1
840190,393868,age_25_34,income_20_40,М,0
840192,339025,age_65_inf,income_0_20,Ж,0


In [24]:
# Reshape each user attribute into long form (id, value, feature) and stack
# the per-attribute frames on top of each other.
user_features_frames = [
    users.reindex(columns=[Columns.User, feature_name])
    .set_axis(["id", "value"], axis=1)
    .assign(feature=feature_name)
    for feature_name in ("sex", "age", "income")
]
user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex


#### Фичи фильмов

In [25]:
# Keep only the items that occur in train.
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()

#### Жанр

In [26]:
# Lower-case the comma-separated genre string, normalise ", " to "," and split
# it into a list; then emit one (id, value, feature) row per genre.
items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"
genre_feature.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


#### Содержание

In [27]:
# Long-form (id, value, feature) frame for the `content_type` column.
content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"
content_feature.head()

Unnamed: 0,id,value,feature
0,10711,film,content_type
1,2508,film,content_type
2,10716,film,content_type
3,7868,film,content_type
4,16268,film,content_type


#### Режиссер

In [28]:
# Same transformation as for genres: split the comma-separated directors string
# and emit one (id, value, feature) row per director.
items["director"] = items["directors"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
director_feature = items[["item_id", "director"]].explode("director")
director_feature.columns = ["id", "value"]
director_feature["feature"] = "director"
director_feature.head()

Unnamed: 0,id,value,feature
0,10711,педро альмодовар,director
1,2508,скот армстронг,director
2,10716,адам п. калтраро,director
3,7868,эндрю хэй,director
4,16268,виктор садовский,director


#### Страна

In [29]:
# Split the comma-separated countries string and emit one
# (id, value, feature) row per country.
items["country"] = items["countries"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
country_feature = items[["item_id", "country"]].explode("country")
country_feature.columns = ["id", "value"]
country_feature["feature"] = "country"
country_feature.head()

Unnamed: 0,id,value,feature
0,10711,испания,country
1,2508,сша,country
2,10716,канада,country
3,7868,великобритания,country
4,16268,ссср,country


#### Год Выпуска

In [30]:
# Long-form (id, value, feature) frame for the release year.
# The recorded output shows float values (e.g. 2002.0); the feature is later
# passed to the dataset as a categorical one.
year_feature = items.reindex(columns=[Columns.Item, "release_year"])
year_feature.columns = ["id", "value"]
year_feature["feature"] = "release_year"
year_feature.head()

Unnamed: 0,id,value,feature
0,10711,2002.0,release_year
1,2508,2014.0,release_year
2,10716,2011.0,release_year
3,7868,2015.0,release_year
4,16268,1978.0,release_year


In [31]:
# Stack all item feature frames into a single long-form frame.
item_features = pd.concat((genre_feature, content_feature, country_feature, year_feature, director_feature))
item_features

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre
...,...,...,...
15960,10632,амир камдин,director
15960,10632,эрик эгер,director
15961,4538,марк о’коннор,director
15961,4538,конор макмахон,director


In [32]:
# Metric classes to evaluate, keyed by their display name.
metrics_name = {
    'Precision': Precision,
    'Recall': Recall,
    'MAP': MAP,
}

# One metric instance per (metric, cutoff) pair for k = 1..10,
# keyed as "<name>@<k>".
metrics = {
    f'{metric_name}@{k}': metric(k=k)
    for metric_name, metric in metrics_name.items()
    for k in range(1, 11)
}


pprint(metrics)

{'MAP@1': MAP(k=1, divide_by_k=False),
 'MAP@10': MAP(k=10, divide_by_k=False),
 'MAP@2': MAP(k=2, divide_by_k=False),
 'MAP@3': MAP(k=3, divide_by_k=False),
 'MAP@4': MAP(k=4, divide_by_k=False),
 'MAP@5': MAP(k=5, divide_by_k=False),
 'MAP@6': MAP(k=6, divide_by_k=False),
 'MAP@7': MAP(k=7, divide_by_k=False),
 'MAP@8': MAP(k=8, divide_by_k=False),
 'MAP@9': MAP(k=9, divide_by_k=False),
 'Precision@1': Precision(k=1),
 'Precision@10': Precision(k=10),
 'Precision@2': Precision(k=2),
 'Precision@3': Precision(k=3),
 'Precision@4': Precision(k=4),
 'Precision@5': Precision(k=5),
 'Precision@6': Precision(k=6),
 'Precision@7': Precision(k=7),
 'Precision@8': Precision(k=8),
 'Precision@9': Precision(k=9),
 'Recall@1': Recall(k=1),
 'Recall@10': Recall(k=10),
 'Recall@2': Recall(k=2),
 'Recall@3': Recall(k=3),
 'Recall@4': Recall(k=4),
 'Recall@5': Recall(k=5),
 'Recall@6': Recall(k=6),
 'Recall@7': Recall(k=7),
 'Recall@8': Recall(k=8),
 'Recall@9': Recall(k=9)}


### Обучение

In [33]:
# Build the training dataset from the train interactions together with the
# long-form user and item feature frames; all listed features are declared
# categorical.
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type", "director", "country", "release_year"],
)
# Users to produce recommendations for (cold users were removed from test above).
TEST_USERS = test[Columns.User].unique()

In [34]:
dataset.interactions.df

Unnamed: 0,user_id,item_id,weight,datetime
0,0,0,3.0,2021-05-11
1,1,1,3.0,2021-05-29
2,2,2,1.0,2021-05-09
3,3,3,3.0,2021-07-05
4,4,0,3.0,2021-04-30
...,...,...,...,...
5476244,69627,219,3.0,2021-08-02
5476245,40052,132,1.0,2021-05-12
5476246,896790,318,1.0,2021-08-13
5476247,206604,2546,3.0,2021-04-13


In [None]:
# Number of recommendations per user.
K_RECOS = 10

# Fit a BM25 item-kNN model on the train dataset and recommend K_RECOS items
# to every test user, with `filter_viewed=True`.
model = ImplicitItemKNNWrapperModel(model=BM25Recommender(K=100, K1=0.05, B=0.1, num_threads=2))
model.fit(dataset)
recos = model.recommend(
    users=TEST_USERS,
    dataset=dataset,
    k=K_RECOS,
    filter_viewed=True,
)


In [None]:
# Smoke check: recommendations for a single user.
# The wrapper model is queried through `recommend` everywhere else in this
# notebook with exactly these keyword arguments, so the same method is used here
# in place of `predict`.
# NOTE(review): user id 123 is assumed to be present in `dataset` — confirm.
model.recommend(
    users=[123],
    dataset=dataset,
    k=K_RECOS,
    filter_viewed=True,
)

In [None]:
# Collect the metric values of the baseline model into a one-element list of
# dicts: {'model': <name>, '<metric>@<k>': value, ...}.
results = []
model_quality = {'model': 'BM25Recommender'}
metric_values = calc_metrics(metrics, recos, test, train)
model_quality.update(metric_values)
results.append(model_quality)

# Transpose so that metrics become rows and models become columns.
df_quality = pd.DataFrame(results).T

# The first row holds the model names: promote it to the column header ...
df_quality.columns = df_quality.iloc[0]

# ... and remove it from the body of the table.
df_quality.drop('model', inplace=True)

In [None]:
df_quality.style.highlight_max(color='lightgreen', axis=1)

In [None]:
# Target directory: <parent of cwd>/service/models.
dill_file = Path().cwd().parent / 'service' / 'models'
# Create the directory on first run so that open() below cannot fail with
# FileNotFoundError.
dill_file.mkdir(parents=True, exist_ok=True)

# Serialize the fitted model.
with open(dill_file / 'BM25Recommender_0.095432.dill', 'wb') as f:
    dill.dump(model, f)

In [None]:
# Target directory: <parent of cwd>/service/data.
dill_file = Path().cwd().parent / 'service' / 'data'
# Create the directory on first run so that open() below cannot fail with
# FileNotFoundError.
dill_file.mkdir(parents=True, exist_ok=True)

# Serialize the dataset the model was fitted on.
with open(dill_file / 'dataset_BM25Recommender_0.095432.dill', 'wb') as f:
    dill.dump(dataset, f)

### Подбор гиперпараметров

In [35]:
# Rebuild the training dataset for the hyperparameter search (same arguments as
# in the training section above).
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type", "director", "country", "release_year"],
)
TEST_USERS = test[Columns.User].unique()

# Accumulates one dict of metric values per finished trial (filled by `objective`).
results_opto = []

def objective(trial):
    """Optuna objective: fit one item-kNN variant and return its MAP@10.

    Samples the similarity model (BM25 / Cosine / TF-IDF) and its
    hyperparameters, fits it on the module-level ``dataset``, recommends
    items to ``TEST_USERS`` and scores the result with ``calc_metrics``
    against the module-level ``test`` and ``train`` frames. The metric values
    of every trial are also appended to the module-level ``results_opto``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The ``MAP@10`` value of the trial (the quantity being maximised).

    Raises:
        ValueError: if an unexpected model name is sampled.
        KeyError: if ``MAP@10`` is absent from the computed metrics.
    """
    # Number of recommendations per user.
    K_RECOS = 10

    reco_model = trial.suggest_categorical(
        "reco_model",
        ["BM25Recommender", "CosineRecommender", "TFIDFRecommender"],
    )
    model_quality_opto = {"model": f"{reco_model}_{trial.number}"}

    # `step` is passed by keyword: newer Optuna releases accept it only that way.
    # NOTE(review): the three branches share the parameter name "K" with
    # different ranges; per-model names may suit the sampler better, but they
    # would change the keys of `study.best_params`, so the names are kept.
    if reco_model == "BM25Recommender":
        K = trial.suggest_int("K", 100, 500, step=50)
        K1 = trial.suggest_float("K1", 0.01, 0.09)
        B = trial.suggest_float("B", 0.01, 0.5)
        knn = BM25Recommender(K=K, K1=K1, B=B, num_threads=2)
    elif reco_model == "CosineRecommender":
        K = trial.suggest_int("K", 50, 200, step=50)
        knn = CosineRecommender(K=K)
    elif reco_model == "TFIDFRecommender":
        # With low=10 and step=20 the value 100 is unreachable; the grid is
        # 10, 30, ..., 90, so the upper bound is stated explicitly.
        K = trial.suggest_int("K", 10, 90, step=20)
        knn = TFIDFRecommender(K=K)
    else:
        # Guards against `model` being undefined if the choice list changes.
        raise ValueError(f"Unknown reco_model: {reco_model}")

    model = ImplicitItemKNNWrapperModel(model=knn)

    # Fit and recommend for the test users.
    model.fit(dataset)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )

    # Compute the metrics and remember them for the comparison table.
    metric_values = calc_metrics(metrics, recos, test, train)
    model_quality_opto.update(metric_values)
    results_opto.append(model_quality_opto)

    # Index directly: a missing key should fail loudly instead of returning None.
    return metric_values['MAP@10']

In [37]:
# Run the hyperparameter search.
# The sampler is seeded so that repeated runs propose the same sequence of
# trials (the study is otherwise not reproducible).
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

# Plain print(): pprint() on a str shows its repr (quotes, wrapped lines).
print(f"Number of finished trials: {len(study.trials)}")
trial = study.best_trial
print(f"Best trial: {trial}")

[32m[I 2022-12-13 15:57:26,490][0m A new study created in memory with name: no-name-6a7c9814-a55a-47bd-a7c5-ae140cc75686[0m
[32m[I 2022-12-13 15:57:37,360][0m Trial 0 finished with value: 0.080213683554881 and parameters: {'reco_model': 'TFIDFRecommender', 'K': 50}. Best is trial 0 with value: 0.080213683554881.[0m
[32m[I 2022-12-13 15:57:49,016][0m Trial 1 finished with value: 0.08046789346619015 and parameters: {'reco_model': 'TFIDFRecommender', 'K': 70}. Best is trial 1 with value: 0.08046789346619015.[0m
[32m[I 2022-12-13 15:58:01,887][0m Trial 2 finished with value: 0.07948747637554937 and parameters: {'reco_model': 'CosineRecommender', 'K': 150}. Best is trial 1 with value: 0.08046789346619015.[0m
[32m[I 2022-12-13 15:58:16,675][0m Trial 3 finished with value: 0.08854371896601809 and parameters: {'reco_model': 'BM25Recommender', 'K': 350, 'K1': 0.050714936742629564, 'B': 0.022260814016119207}. Best is trial 3 with value: 0.08854371896601809.[0m
[32m[I 2022-12-13 1

[32m[I 2022-12-13 16:06:03,996][0m Trial 35 finished with value: 0.07964252966443905 and parameters: {'reco_model': 'CosineRecommender', 'K': 200}. Best is trial 21 with value: 0.0885579403475159.[0m
[32m[I 2022-12-13 16:06:16,498][0m Trial 36 finished with value: 0.08068685313076739 and parameters: {'reco_model': 'TFIDFRecommender', 'K': 90}. Best is trial 21 with value: 0.0885579403475159.[0m
[32m[I 2022-12-13 16:06:32,628][0m Trial 37 finished with value: 0.08847305963231346 and parameters: {'reco_model': 'BM25Recommender', 'K': 400, 'K1': 0.049716481757783926, 'B': 0.011137683927086944}. Best is trial 21 with value: 0.0885579403475159.[0m
[32m[I 2022-12-13 16:06:50,061][0m Trial 38 finished with value: 0.08548005708412469 and parameters: {'reco_model': 'BM25Recommender', 'K': 350, 'K1': 0.06603657207162444, 'B': 0.13572990336626173}. Best is trial 21 with value: 0.0885579403475159.[0m
[32m[I 2022-12-13 16:07:04,060][0m Trial 39 finished with value: 0.0885120481549458 

[32m[I 2022-12-13 16:14:51,328][0m Trial 69 finished with value: 0.08851976779297428 and parameters: {'reco_model': 'BM25Recommender', 'K': 400, 'K1': 0.01971729976552715, 'B': 0.07145893685348058}. Best is trial 21 with value: 0.0885579403475159.[0m
[32m[I 2022-12-13 16:15:05,129][0m Trial 70 finished with value: 0.07964252966443905 and parameters: {'reco_model': 'CosineRecommender', 'K': 200}. Best is trial 21 with value: 0.0885579403475159.[0m
[32m[I 2022-12-13 16:15:20,915][0m Trial 71 finished with value: 0.08840491446220693 and parameters: {'reco_model': 'BM25Recommender', 'K': 450, 'K1': 0.016490679973023677, 'B': 0.029179441391368195}. Best is trial 21 with value: 0.0885579403475159.[0m
[32m[I 2022-12-13 16:15:36,641][0m Trial 72 finished with value: 0.08856761753913403 and parameters: {'reco_model': 'BM25Recommender', 'K': 450, 'K1': 0.03051069758953803, 'B': 0.033487916403253035}. Best is trial 72 with value: 0.08856761753913403.[0m
[32m[I 2022-12-13 16:15:51,871

'Number of finished trials: 100'
('Best trial: FrozenTrial(number=91, values=[0.08857784132207068], '
 'datetime_start=datetime.datetime(2022, 12, 13, 16, 20, 20, 847081), '
 'datetime_complete=datetime.datetime(2022, 12, 13, 16, 20, 34, 979059), '
 "params={'reco_model': 'BM25Recommender', 'K': 300, 'K1': "
 "0.038188799072342246, 'B': 0.027325616080210426}, "
 "distributions={'reco_model': "
 "CategoricalDistribution(choices=('BM25Recommender', 'CosineRecommender', "
 "'TFIDFRecommender')), 'K': IntDistribution(high=500, log=False, low=100, "
 "step=50), 'K1': FloatDistribution(high=0.09, log=False, low=0.01, "
 "step=None), 'B': FloatDistribution(high=0.5, log=False, low=0.01, "
 'step=None)}, user_attrs={}, system_attrs={}, intermediate_values={}, '
 'trial_id=91, state=TrialState.COMPLETE, value=None)')


In [38]:
# One column per trial, one row per metric (plus the 'model' row).
df_quality = pd.DataFrame(results_opto).T

# Promote the 'model' row (trial labels) to the column header ...
df_quality.columns = df_quality.iloc[0]

# ... and remove it from the body of the table.
df_quality.drop('model', inplace=True)

In [39]:
# Drop trials whose metric values are all identical to an earlier trial
# (duplicates are detected on the transposed table, i.e. per trial).
df_quality = df_quality.T.drop_duplicates().T

In [40]:
df_quality.style.highlight_max(color='lightgreen', axis=1)

model,TFIDFRecommender_0,TFIDFRecommender_1,CosineRecommender_2,BM25Recommender_3,CosineRecommender_4,TFIDFRecommender_5,BM25Recommender_6,BM25Recommender_7,BM25Recommender_8,BM25Recommender_9,BM25Recommender_10,BM25Recommender_11,BM25Recommender_12,BM25Recommender_13,BM25Recommender_14,BM25Recommender_16,BM25Recommender_17,CosineRecommender_18,BM25Recommender_20,BM25Recommender_21,BM25Recommender_22,BM25Recommender_23,BM25Recommender_24,BM25Recommender_25,BM25Recommender_26,BM25Recommender_27,TFIDFRecommender_29,BM25Recommender_31,BM25Recommender_32,BM25Recommender_33,BM25Recommender_34,BM25Recommender_37,BM25Recommender_38,BM25Recommender_39,BM25Recommender_40,BM25Recommender_41,BM25Recommender_42,BM25Recommender_43,BM25Recommender_44,BM25Recommender_45,BM25Recommender_47,BM25Recommender_49,BM25Recommender_50,BM25Recommender_51,BM25Recommender_52,BM25Recommender_53,BM25Recommender_54,BM25Recommender_55,BM25Recommender_56,BM25Recommender_57,BM25Recommender_59,BM25Recommender_60,BM25Recommender_61,BM25Recommender_62,BM25Recommender_63,BM25Recommender_64,BM25Recommender_65,BM25Recommender_66,BM25Recommender_68,BM25Recommender_69,BM25Recommender_71,BM25Recommender_72,BM25Recommender_73,BM25Recommender_74,BM25Recommender_75,BM25Recommender_76,BM25Recommender_77,BM25Recommender_78,BM25Recommender_80,BM25Recommender_81,BM25Recommender_82,BM25Recommender_83,BM25Recommender_84,BM25Recommender_85,BM25Recommender_86,BM25Recommender_87,BM25Recommender_88,BM25Recommender_90,BM25Recommender_91,BM25Recommender_92,BM25Recommender_93,BM25Recommender_94,BM25Recommender_95,BM25Recommender_96,BM25Recommender_97,BM25Recommender_98
Precision@1,0.083165,0.083323,0.083257,0.090973,0.083315,0.079473,0.088467,0.091048,0.090907,0.090625,0.091123,0.090923,0.08938,0.09065,0.073208,0.090011,0.060754,0.08285,0.090409,0.090708,0.090243,0.091131,0.087862,0.08602,0.090899,0.085953,0.083364,0.091098,0.090749,0.090824,0.091031,0.090318,0.08836,0.090907,0.089795,0.090334,0.091064,0.090932,0.090824,0.09104,0.08821,0.086966,0.091172,0.090915,0.0906,0.091056,0.090799,0.090982,0.08855,0.089886,0.075855,0.091106,0.091147,0.091098,0.090824,0.091031,0.089861,0.076826,0.090923,0.090907,0.090002,0.090973,0.091114,0.090708,0.09065,0.090957,0.09094,0.087347,0.089795,0.090923,0.090982,0.09094,0.091015,0.09094,0.089878,0.090774,0.09016,0.090973,0.091015,0.090641,0.091123,0.090691,0.090625,0.091081,0.090998,0.090691
Recall@1,0.041777,0.041703,0.041644,0.047413,0.041663,0.040013,0.044896,0.047262,0.047384,0.046537,0.047429,0.047344,0.045444,0.046539,0.03575,0.047238,0.028633,0.04155,0.04628,0.047477,0.046091,0.047416,0.044474,0.043324,0.047138,0.043269,0.041859,0.047397,0.047453,0.046664,0.047242,0.0474,0.04484,0.047363,0.045777,0.047341,0.047346,0.047161,0.047,0.047334,0.044789,0.043929,0.047432,0.047382,0.047413,0.047356,0.046672,0.04726,0.044968,0.047214,0.03736,0.047414,0.047423,0.047413,0.046678,0.047235,0.045724,0.03794,0.047483,0.04738,0.047251,0.047499,0.04742,0.047506,0.047436,0.047175,0.047378,0.044127,0.047146,0.047346,0.047208,0.047404,0.047436,0.047433,0.047205,0.046602,0.046049,0.047488,0.047502,0.047419,0.047408,0.047492,0.047428,0.047359,0.047425,0.047045
Precision@2,0.069906,0.070408,0.070026,0.076523,0.070209,0.067417,0.075167,0.076747,0.076598,0.076561,0.076677,0.07666,0.075411,0.076561,0.062339,0.076349,0.057779,0.069923,0.076229,0.076469,0.076245,0.076677,0.074034,0.072126,0.076805,0.071976,0.0704,0.076685,0.076453,0.076718,0.076706,0.076378,0.074573,0.076602,0.076108,0.076403,0.076623,0.076822,0.076805,0.076623,0.074772,0.072706,0.076681,0.07661,0.076507,0.076623,0.076747,0.076652,0.075204,0.076328,0.064898,0.076656,0.076644,0.07666,0.076664,0.076702,0.075743,0.065654,0.076449,0.076639,0.076341,0.076486,0.076685,0.076457,0.076436,0.076726,0.076639,0.073171,0.076295,0.076631,0.07671,0.076502,0.076519,0.07649,0.076328,0.076614,0.076187,0.076498,0.076507,0.076453,0.076619,0.076444,0.076478,0.076627,0.076519,0.076743
Recall@2,0.068434,0.068666,0.068111,0.07796,0.068332,0.065669,0.075133,0.077913,0.078005,0.077386,0.077947,0.078047,0.075515,0.07738,0.059824,0.07785,0.054646,0.067977,0.076837,0.07796,0.076838,0.07797,0.073725,0.071491,0.077847,0.071289,0.068677,0.077956,0.07791,0.077663,0.077897,0.077884,0.074324,0.077976,0.076548,0.077918,0.077895,0.077859,0.077812,0.077887,0.074669,0.072095,0.077982,0.077972,0.077991,0.077892,0.0777,0.077916,0.075215,0.077855,0.062812,0.078047,0.077973,0.078051,0.077541,0.077922,0.07608,0.063695,0.077908,0.077961,0.077845,0.077953,0.077994,0.077953,0.077889,0.077828,0.078037,0.072752,0.077802,0.078039,0.077891,0.077949,0.077907,0.077916,0.077862,0.077443,0.076744,0.077978,0.077981,0.07791,0.07797,0.077931,0.077972,0.0779,0.077907,0.077747
Precision@3,0.060997,0.061196,0.060978,0.06826,0.061083,0.058663,0.065489,0.068141,0.068255,0.067262,0.06828,0.068291,0.066471,0.067254,0.056373,0.068036,0.053513,0.060829,0.066897,0.068155,0.066853,0.06831,0.06467,0.063381,0.067862,0.063188,0.061382,0.068271,0.068216,0.067602,0.068177,0.068108,0.065116,0.068252,0.066747,0.068122,0.068291,0.067873,0.067782,0.06828,0.06529,0.063697,0.068269,0.068288,0.068144,0.068255,0.067655,0.068274,0.065608,0.068,0.057443,0.068305,0.068341,0.068299,0.067525,0.068219,0.066664,0.057842,0.068222,0.068269,0.068028,0.068219,0.068324,0.068155,0.068202,0.0681,0.068249,0.064081,0.067998,0.068285,0.068202,0.068263,0.068235,0.068233,0.068003,0.067364,0.066819,0.068238,0.068252,0.068213,0.068332,0.068161,0.068183,0.068222,0.068241,0.067804
Recall@3,0.087023,0.087242,0.086909,0.102676,0.087136,0.084054,0.096556,0.102016,0.102641,0.100048,0.102364,0.102648,0.098403,0.099999,0.080323,0.102532,0.075529,0.08663,0.099362,0.102623,0.099286,0.102492,0.094926,0.092203,0.101476,0.091867,0.087719,0.102409,0.102662,0.100808,0.102123,0.102618,0.095741,0.102653,0.098924,0.102626,0.102372,0.101443,0.10132,0.102363,0.096128,0.09286,0.102443,0.102637,0.102636,0.10237,0.100918,0.102342,0.096752,0.102513,0.082018,0.102585,0.102576,0.102575,0.100607,0.10221,0.098755,0.082676,0.102653,0.102644,0.102537,0.102611,0.102512,0.102635,0.102697,0.102017,0.102651,0.093792,0.102513,0.102708,0.102164,0.102685,0.102641,0.10264,0.102507,0.100218,0.099175,0.102658,0.102676,0.102719,0.10258,0.102643,0.102688,0.1023,0.102644,0.10139
Precision@4,0.055761,0.05607,0.055437,0.061036,0.055452,0.053894,0.059418,0.060669,0.061021,0.060225,0.060912,0.060976,0.059748,0.06026,0.052627,0.060928,0.050314,0.055195,0.060071,0.061011,0.060063,0.060949,0.058781,0.056985,0.060642,0.056817,0.056296,0.060887,0.061051,0.060424,0.060729,0.060972,0.059063,0.060976,0.060003,0.060965,0.06081,0.060621,0.060488,0.060864,0.059217,0.057366,0.060999,0.060988,0.060976,0.060845,0.060439,0.060851,0.05953,0.06093,0.053226,0.060978,0.060984,0.060974,0.060366,0.06075,0.059955,0.053411,0.061003,0.060955,0.060928,0.061013,0.060957,0.061015,0.06103,0.060656,0.060965,0.058248,0.060928,0.060965,0.060737,0.061024,0.060992,0.061009,0.060934,0.060312,0.060038,0.061026,0.061024,0.06103,0.060974,0.061011,0.060984,0.060787,0.060995,0.060573
Recall@4,0.104778,0.105375,0.103754,0.120876,0.10384,0.10215,0.115467,0.11957,0.120756,0.118105,0.120256,0.120599,0.116407,0.118218,0.099224,0.120728,0.094253,0.103211,0.117567,0.120939,0.117551,0.120362,0.113939,0.109175,0.119445,0.108861,0.105803,0.120217,0.12099,0.118688,0.119744,0.120813,0.114598,0.120658,0.11721,0.120798,0.120052,0.119363,0.118962,0.120193,0.114987,0.110081,0.120457,0.120637,0.120858,0.120158,0.118709,0.120113,0.115636,0.12075,0.100318,0.120521,0.120489,0.120524,0.118541,0.119796,0.117001,0.100711,0.120898,0.12061,0.120731,0.120868,0.120407,0.120952,0.12098,0.119519,0.120627,0.112322,0.120767,0.120632,0.119766,0.120851,0.12081,0.120878,0.120757,0.118325,0.117443,0.120858,0.120855,0.120973,0.120484,0.120962,0.120925,0.120038,0.120793,0.119233
Precision@5,0.051728,0.051984,0.051087,0.05471,0.051172,0.049886,0.053552,0.054566,0.05475,0.05432,0.054735,0.054771,0.053754,0.05437,0.049504,0.054655,0.047632,0.05095,0.054162,0.054672,0.054156,0.054657,0.05318,0.052252,0.05461,0.052139,0.05211,0.054679,0.054672,0.054413,0.054577,0.054693,0.053331,0.054715,0.054005,0.054702,0.054642,0.054591,0.054481,0.054655,0.053439,0.052501,0.054683,0.054765,0.054687,0.054649,0.05444,0.05466,0.053611,0.054649,0.050087,0.054755,0.054732,0.054753,0.054393,0.054622,0.053909,0.050206,0.054665,0.054705,0.054649,0.054677,0.054662,0.054642,0.05466,0.05461,0.054722,0.053016,0.05462,0.05474,0.054592,0.054703,0.054702,0.054682,0.054649,0.05442,0.054119,0.054683,0.054683,0.054664,0.054725,0.054645,0.05466,0.054629,0.054712,0.054539
Recall@5,0.12065,0.121219,0.118324,0.1341,0.118492,0.117276,0.128924,0.133334,0.134154,0.132126,0.133845,0.134147,0.129828,0.132179,0.116281,0.13435,0.111315,0.117932,0.131338,0.134236,0.131298,0.133716,0.12766,0.124441,0.133269,0.124174,0.121545,0.133709,0.134176,0.132436,0.133337,0.134403,0.128242,0.134062,0.130805,0.134405,0.133599,0.133186,0.132729,0.133657,0.128559,0.125241,0.13379,0.134159,0.134325,0.133613,0.132538,0.133651,0.129068,0.134398,0.117875,0.134097,0.133936,0.134095,0.132367,0.133475,0.130533,0.118257,0.134097,0.134053,0.134342,0.134086,0.133752,0.134172,0.134155,0.133316,0.134064,0.126901,0.134339,0.13409,0.133395,0.134082,0.134076,0.134053,0.134406,0.132335,0.131221,0.134106,0.134102,0.134168,0.133939,0.13419,0.134255,0.133561,0.134087,0.133007


## Обучение модели с лучшими параметрами

### подготавливаем матрицы

### Создаем маппинг для users и items

In [41]:
dataset.interactions.df

Unnamed: 0,user_id,item_id,weight,datetime
0,0,0,3.0,2021-05-11
1,1,1,3.0,2021-05-29
2,2,2,1.0,2021-05-09
3,3,3,3.0,2021-07-05
4,4,0,3.0,2021-04-30
...,...,...,...,...
5476244,69627,219,3.0,2021-08-02
5476245,40052,132,1.0,2021-05-12
5476246,896790,318,1.0,2021-08-13
5476247,206604,2546,3.0,2021-04-13


In [42]:
# Matrix row index -> user id, numbered in order of first appearance in train ...
users_inv_mapping = dict(enumerate(train['user_id'].unique()))
# ... and the reverse lookup: user id -> matrix row index.
users_mapping = {v: k for k, v in users_inv_mapping.items()}


# The same pair of lookups for items (matrix column index <-> item id).
items_inv_mapping = dict(enumerate(train['item_id'].unique()))
items_mapping = {v: k for k, v in items_inv_mapping.items()}

In [43]:
print(f"users_mapping amount: {len(users_mapping)}")
print(f"items_mapping amount: {len(items_mapping)}")

users_mapping amount: 896791
items_mapping amount: 15565


### Получаем разреженную матрицу

In [44]:
def get_coo_matrix(df, 
                   user_col='user_id', 
                   item_col='item_id', 
                   weight_col=None, 
                   users_mapping=None, 
                   items_mapping=None):
    """Build a sparse users x items interaction matrix in COO format.

    Args:
        df: interactions frame with at least ``user_col`` and ``item_col``.
        user_col: name of the column holding user ids.
        item_col: name of the column holding item ids.
        weight_col: name of the column with interaction weights; when falsy,
            every interaction gets weight 1.
        users_mapping: dict ``user id -> row index``. Required.
        items_mapping: dict ``item id -> column index``. Required.

    Returns:
        ``scipy.sparse.coo_matrix`` of dtype float32 whose shape is derived
        from the mappings (largest index + 1 on each axis), so it does not
        depend on which ids happen to occur in ``df``.

    Raises:
        ValueError: if a mapping is missing, or if ``df`` contains ids that
            are absent from the mappings.
    """
    # Imported here so the function does not rely on `scipy.sparse` having
    # been loaded as a side effect of another import.
    from scipy.sparse import coo_matrix

    if users_mapping is None or items_mapping is None:
        raise ValueError("users_mapping and items_mapping must both be provided")

    if weight_col:
        weights = df[weight_col].to_numpy(dtype=np.float32)
    else:
        weights = np.ones(len(df), dtype=np.float32)

    row_idx = df[user_col].map(users_mapping.get)
    col_idx = df[item_col].map(items_mapping.get)

    # Ids missing from the mappings come back as None/NaN; report them
    # explicitly instead of letting scipy fail on invalid indices.
    n_unmapped = int((row_idx.isna() | col_idx.isna()).sum())
    if n_unmapped:
        raise ValueError(
            f"{n_unmapped} interaction(s) reference ids missing from the mappings"
        )

    # Explicit shape: keeps the matrix aligned with the mappings and also
    # allows an empty interactions frame.
    n_users = max(users_mapping.values()) + 1 if users_mapping else 0
    n_items = max(items_mapping.values()) + 1 if items_mapping else 0

    interaction_matrix = coo_matrix(
        (
            weights,
            (
                row_idx.to_numpy(dtype=np.int64),
                col_idx.to_numpy(dtype=np.int64),
            ),
        ),
        shape=(n_users, n_items),
    )
    return interaction_matrix
# Weighted interaction matrix for train (rows follow `users_mapping`,
# columns follow `items_mapping`).
interaction_matrix = get_coo_matrix(train, weight_col='weight',
                                    users_mapping=users_mapping, 
                                    items_mapping=items_mapping)

### Обучаем ItemKNN

In [None]:
# Fit a BM25 nearest-neighbour model directly on the sparse interaction matrix.
# NOTE(review): this section is titled "best parameters", yet these values
# (K=50, K1~0.0126, B~0.0529) differ from the best trial recorded in the study
# output above (K=300, K1~0.0382, B~0.0273) — confirm which set is intended.
userknn = BM25Recommender(K=50, K1=0.012556305101247701, B=0.05289835164246949, num_threads=2)
userknn.fit(interaction_matrix)

In [None]:
# Target directory: <parent of cwd>/service/models.
dill_file = Path().cwd().parent / 'service' / 'models'
# Create the directory on first run so that open() below cannot fail with
# FileNotFoundError.
dill_file.mkdir(parents=True, exist_ok=True)

# Serialize the fitted nearest-neighbour model.
with open(dill_file / 'userknn_BM25Recommender.dill', 'wb') as f:
    dill.dump(userknn, f)

In [None]:
# Target directory: <parent of cwd>/service/data.
dill_file = Path().cwd().parent / 'service' / 'data'
# Create the directory on first run so that open() below cannot fail with
# FileNotFoundError.
dill_file.mkdir(parents=True, exist_ok=True)

# Serialize the dataset next to the model.
with open(dill_file / 'dataset_userknn_BM25Recommender.dill', 'wb') as f:
    dill.dump(dataset, f)

## Inference ItemKNN

### Предподготовка для ускорения инференса по одному пользователю

In [47]:
# Directory with the serialized models: <parent of cwd>/service/models.
dill_file = Path().cwd().parent / 'service' / 'models'

# SECURITY: unpickling can execute arbitrary code — load only artifacts that
# were produced by this notebook.
with open(dill_file / 'userknn_BM25Recommender.dill', 'rb') as f:
    userknn = dill.load(f)

In [48]:
# Directory with the serialized data: <parent of cwd>/service/data.
dill_file = Path().cwd().parent / 'service' / 'data'

# SECURITY: unpickling can execute arbitrary code — load only artifacts that
# were produced by this notebook.
with open(dill_file / 'dataset_userknn_BM25Recommender.dill', 'rb') as f:
    dataset = dill.load(f)

In [None]:
# Use the interactions stored in the loaded dataset as the training frame.
# NOTE(review): the recorded output of `dataset.interactions.df` shows re-indexed
# ids (0, 1, 2, ...), whereas `users_mapping` is keyed by the `user_id` values of
# the original train frame — verify both use the same id space before the
# mapper and `make_reco*` functions are called.
train = dataset.interactions.df

In [49]:
# Example user, number of recommendations and number of neighbours to request.
user_id = 6064
K_RECOS = 10
N = 50
# Number of train interactions per item.
cnt = Counter(train['item_id'].values)
idf = pd.DataFrame.from_dict(cnt, orient='index', columns=['doc_freq']).reset_index()
# Total number of train interactions.
n = train.shape[0]
# Smoothed inverse frequency, computed on the whole column at once instead of
# calling a Python lambda for every row.
idf['idf'] = np.log((1 + n) / (1 + idf['doc_freq']) + 1)

def generate_implicit_recs_mapper(model, N, users_mapping, users_inv_mapping):
    """Create a function that returns the nearest neighbours of a user.

    Args:
        model: fitted model exposing ``similar_items(idx, N=...)`` that yields
            ``(index, similarity)`` pairs.
        N: number of neighbours to request from the model.
        users_mapping: dict ``user id -> model index``.
        users_inv_mapping: dict ``model index -> user id``.

    Returns:
        A callable ``user id -> (list of neighbour user ids, list of
        similarities)``; both lists are in the order returned by the model.
    """
    def _recs_mapper(user):
        internal_idx = users_mapping[user]
        neighbours = model.similar_items(internal_idx, N=N)
        neighbour_ids = []
        similarities = []
        for neighbour_idx, similarity in neighbours:
            # Translate the model index back to the original user id.
            neighbour_ids.append(users_inv_mapping[neighbour_idx])
            similarities.append(similarity)
        return neighbour_ids, similarities
    return _recs_mapper

In [50]:
mapper = generate_implicit_recs_mapper(
    userknn, 
    N=N,
    users_mapping=users_mapping,
    users_inv_mapping=users_inv_mapping
)

In [51]:
watched = train.groupby('user_id').agg({'item_id': list})
watched.head()

Unnamed: 0_level_0,item_id
user_id,Unnamed: 1_level_1
0,"[7102, 14359, 15297, 6006, 9728, 12192]"
1,[10440]
2,"[7571, 3541, 15266, 13867, 12841, 10770, 4475,..."
3,"[12192, 9728, 16406, 15719, 10440, 3475, 4151,..."
4,"[4700, 6317]"


In [52]:
# Lookup "user id as string" -> list of watched items. Built straight from the
# frame instead of serializing it to JSON and parsing it back; the keys stay
# strings because `make_reco_fast` looks users up via f"{user_id}".
wwatched = {str(user): watched_items for user, watched_items in watched['item_id'].items()}

In [53]:
# user_id = 33
def make_reco_fast(user_id, mapper, k_recos):
    """Recommend items for one user from the items of similar users.

    Every item watched by a neighbour is scored with
    ``similarity * idf``; the items with the highest scores are returned.
    Relies on the module-level ``wwatched`` (string user id -> watched items)
    and ``idf`` (columns ``index`` and ``idf``) lookups.

    Args:
        user_id: id of the user to recommend for (must be known to ``mapper``).
        mapper: callable returning ``(similar user ids, similarities)``.
        k_recos: maximum number of items to return.

    Returns:
        Array of at most ``k_recos`` unique item ids, best first; empty when
        the user has no neighbours other than itself.
    """
    similar_users, similarities = mapper(user_id)

    # Remove the user itself by id rather than by position: the user is not
    # guaranteed to be the first entry (e.g. when similarities tie).
    neighbours = [
        (neighbour, similarity)
        for neighbour, similarity in zip(similar_users, similarities)
        if neighbour != user_id
    ]
    if not neighbours:
        return np.array([], dtype=object)

    recs = pd.DataFrame({
        'similar_user_id': [neighbour for neighbour, _ in neighbours],
        'similarity': [similarity for _, similarity in neighbours],
        # `wwatched` is keyed by the string form of the user id.
        'item_id': [wwatched.get(f"{neighbour}") for neighbour, _ in neighbours],
    })
    # One row per (neighbour, watched item).
    recs = recs.explode('item_id')
    recs = recs.merge(idf[['index', 'idf']], 
                        left_on='item_id',
                        right_on='index',
                        how='left').drop(['index'], axis=1)
    recs['rank_idf'] = recs['similarity'] * recs['idf']
    # Rows without items or without an idf value cannot be ranked.
    recs = recs.dropna()
    # A single sort is enough; similarity only breaks ties of the main score.
    recs = recs.sort_values(['rank_idf', 'similarity'], ascending=False)
    return recs['item_id'].unique()[:k_recos]

In [54]:
def make_reco(user_id, mapper, k_recos):
    """Recommend items for one user from the items of similar users.

    DataFrame-based counterpart of ``make_reco_fast``: every item watched by
    a neighbour is scored with ``similarity * idf`` and the best ``k_recos``
    distinct items are returned. Relies on the module-level ``train``,
    ``watched`` and ``idf`` frames.

    Args:
        user_id: id of the user to recommend for.
        mapper: callable returning ``(similar user ids, similarities)``.
        k_recos: maximum number of items to return.

    Returns:
        Array of at most ``k_recos`` item ids, best first; empty when the
        user does not occur in ``train``.
    """
    recs = pd.DataFrame({
        'user_id': train[train['user_id'] == user_id]['user_id'].unique()
    })
    # Unknown user: nothing to map (the zip(*...) below would fail on no rows).
    if recs.empty:
        return np.array([], dtype=object)

    recs['similar_user_id'], recs['similarity'] = zip(*recs['user_id'].map(mapper))

    # explode lists to get vertical representation
    recs = recs.set_index('user_id').apply(pd.Series.explode).reset_index()

    # delete recommendations of itself 
    recs = recs[~(recs['user_id'] == recs['similar_user_id'])]

    # join watched items
    recs = recs.merge(watched, left_on=['similar_user_id'], right_on=['user_id'], how='left')
    recs = recs.explode('item_id')

    # score every (neighbour, item) row
    recs = recs.merge(idf[['index', 'idf']], 
                        left_on='item_id',
                        right_on='index',
                        how='left').drop(['index'], axis=1)
    recs['rank_idf'] = recs['similarity'] * recs['idf']
    # rows without a score cannot be ranked
    recs = recs.dropna(subset=['rank_idf'])
    recs = recs.sort_values(['user_id', 'rank_idf'], ascending=False)

    # drop duplicate user_id-item_id pairs, keeping the best-scored one, so
    # that the same item cannot occupy several of the top-k positions
    recs = recs.drop_duplicates(['user_id', 'item_id'], keep='first')

    recs['rank'] = recs.groupby('user_id').cumcount() + 1 
    return recs[recs['rank'] <= k_recos]['item_id'].values

In [55]:
user_id = 100
make_reco(user_id, mapper, K_RECOS)

array([7833, 9951, 10211, 12134, 2620, 1871, 9047, 9517, 9025, 13856],
      dtype=object)

In [56]:
make_reco_fast(user_id, mapper, K_RECOS)

array([7833, 9951, 2620, 12134, 10211, 1871, 9047, 9517, 13856, 9025],
      dtype=object)