## Домашнее задание `максимум 25 баллов (бывшее 10)`

## Критерии оценки 
`❗️Оцениваться будет значение метрики + ревью кода + реализация сервиса.` 

Вы можете сделать НЕ ВСЕ пункты и все равно получить 25 баллов. Получение > 25 баллов будет расцениваться как 25.


### 1. Побейте метрику на лидерборде `map@10 = 0.063` для userKnn модели с семинара (`4 балла`)


### 2. Предоставьте ноутбук(и) с экспериментами (`16 баллов`)

Что можно сделать:
   - сделать кол-во рекомендаций не меньше N (`2 балла`)
   - наличие тюнинга гиперпараметров (например, векторного расстояния или типов kNN моделей (implicit/rectools/...)) (`4 балла`)
   - другие варианты ранжирования айтемов похожих пользователей (`2 балла`)
   - эксперименты с оффлайн валидацией (`2 балла`)
   - в тесте вас ждут холодные пользователи. Сделайте рекомендации для них (обратите внимание на <a href="https://rectools.readthedocs.io/en/latest/api/rectools.models.popular.html"> rectools.models.popular</a>) (`2 балла`)
   - блендинг моделей (`4 балла`)


### 3. Оберните модель в сервис.
- **предпочтительный онлайн вариант**: обучаете модель в ноутбуке, сохраняете обученную модель (pickle, dill), при запуске сервиса ее поднимаете и запрашиваете рекомендации "на лету" (`9 баллов`)
- или оффлайн вариант: предварительно посчитайте рекомендации для всех пользователей, сохраните и запрашивайте их (`4 балла`)
   

### Хороший код ДЗ это:
- комментарии и объяснения. В ipynb пользуйтесь силой маркдауна. 
В скриптах пишите комментарии и докстринг. 
- легкая читаемость и воспроизводимость
- стандарт PEP8 
- обоснование схемы валидации
- анализ метрики качества 

## Imports

In [None]:
from pathlib import Path
from collections import Counter
import optuna
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel, PopularModel
from implicit.nearest_neighbours import CosineRecommender, BM25Recommender, TFIDFRecommender
from rectools.models.implicit_knn import ImplicitItemKNNWrapperModel
from rectools.dataset import Dataset
from rectools import Columns
from rectools.metrics import Precision, Recall, MAP, calc_metrics, MeanInvUserFreq, Serendipity
from rectools.model_selection import TimeRangeSplitter

import dill
import json
from pprint import pprint
import numpy as np
import scipy as sp
import time
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

## Поднимаем данные

In [None]:
# Directory that holds the raw users.csv / items.csv / interactions.csv files.
DATA_PATH = Path("data_original")

In [None]:
# Load the three raw tables that live under DATA_PATH.
users, items, interactions = (
    pd.read_csv(DATA_PATH / f"{table_name}.csv")
    for table_name in ("users", "items", "interactions")
)

In [None]:
# Switch the raw column names to the rectools column constants,
# then parse the timestamp column into real datetimes.
interactions = interactions.rename(
    columns={
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight,
    }
)
interactions['datetime'] = pd.to_datetime(interactions['datetime'])

In [None]:
interactions.head()

In [None]:
# Quick size check of the interactions table.
print(f"Interactions dataframe shape: {interactions.shape}")
for label, column in (("users", "user_id"), ("items", "item_id")):
    print(f"Unique {label} in interactions: {interactions[column].nunique():_}")

In [None]:
# Time span covered by the interactions.
min_date, max_date = interactions[Columns.Datetime].agg(["min", "max"])

print(f"min date in interactions: {min_date}")
print(f"max date in interactions: {max_date}")

## Разбиваем на train/test

In [None]:
# Wrap all interactions (no user/item features) into a rectools Dataset.
# It is used below by the time splitter and by the item-kNN baseline.
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=None,
    item_features_df=None
)

In [None]:
# Cross-validation layout: one test fold, one week ("W") long.
n_folds = 1
unit = "W"
n_units = 1
periods = n_folds + 1  # number of points generated by pd.date_range below
freq = f"{n_units}{unit}"  # pandas frequency string, "1W" with the values above

# Midnight of the last day present in the interactions.
last_date = interactions[Columns.Datetime].max().normalize()
# Step back (n_folds * n_units + 1) units from the last date to get the start of the range.
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)  
print(f"Start date and last date of the test fold: {start_date, last_date}")
    
date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# generator of folds
# NOTE(review): the filter_* flags presumably remove already-seen pairs and
# cold users/items from the test part - confirm against the rectools docs.
cv = TimeRangeSplitter(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv.get_n_splits(dataset.interactions)}")


In [None]:
# Only one test fold is configured, so take the first split instead of looping.
fold_iterator = cv.split(dataset.interactions, collect_fold_stats=True)
train_ids, test_ids, fold_info = next(fold_iterator)

In [None]:
train_ids

In [None]:
# The splitter yields positional row indices, so select rows by position.
# `.loc` only gave the same rows because `interactions` still has its default
# RangeIndex; `.iloc` stays correct if the frame is ever filtered or re-indexed.
train = interactions.iloc[train_ids]
test = interactions.iloc[test_ids]

## Сравнение с ItemKnn

In [None]:
# Baseline for comparison: item-based kNN with cosine similarity (K=30),
# fitted on the dataset built from all interactions.
# The trailing ";" keeps the notebook from echoing the returned object.
item_knn = ImplicitItemKNNWrapperModel(model=CosineRecommender(K=30))
item_knn.fit(dataset);

In [None]:
# take a look at the recommended items by the simple itemknn model
recs_itemknn = item_knn.recommend(
    test['user_id'].unique(), 
    dataset=dataset, 
    k=10, 
    filter_viewed=False  # False - items the user has already interacted with are NOT removed from the output
)

In [None]:
recs_itemknn.head()

## Обучение userKnn с различными мерами расстояния

### Подготовка данных

In [None]:
max_date = interactions[Columns.Datetime].max()

In [None]:
# Replace the raw weight with a two-level one: 3 when watched_pct > 10, otherwise 1.
# Rows with a missing watched_pct fail the comparison and therefore also get 1.
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

In [None]:
# Hold out the last 7 days as test; everything strictly before goes to train.
split_date = max_date - pd.Timedelta(days=7)
interaction_dates = interactions[Columns.Datetime]
train = interactions[interaction_dates < split_date].copy()
test = interactions[interaction_dates >= split_date].copy()

print(f"train: {train.shape}")
print(f"test: {test.shape}")

In [None]:
# Users that occur in test but never in train ("cold" users); they are removed from test next.
cold_users = set(test[Columns.User]).difference(train[Columns.User])

In [None]:
# Keep only the test rows whose user is not cold.
is_cold_row = test[Columns.User].isin(cold_users)
test = test[~is_cold_row].copy()

### Подготовка фичей

#### Пользовательские фичи

In [None]:
# The share of missing values is small, so they are replaced with an explicit
# 'Unknown' value (applied to every column of `users`).
users.fillna('Unknown', inplace=True)

In [None]:
# Keep only the users that are present in train.
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()

In [None]:
users

In [None]:
# Reshape the wide users table into the long (id, value, feature) layout:
# one frame per feature, stacked on top of each other.
user_features_frames = [
    users.reindex(columns=[Columns.User, feature_name])
    .set_axis(["id", "value"], axis=1)
    .assign(feature=feature_name)
    for feature_name in ["sex", "age", "income"]
]
user_features = pd.concat(user_features_frames)
user_features.head()

#### Фичи фильмов

In [None]:
# Keep only the items that are present in train.
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()

#### Жанр

In [None]:
# Split the comma-separated, lower-cased genre string into a list per item,
# then emit one (id, value, feature) row per item-genre pair.
items["genre"] = (
    items["genres"]
    .str.lower()
    .str.replace(", ", ",", regex=False)
    .str.split(",")
)
genre_feature = (
    items[["item_id", "genre"]]
    .explode("genre")
    .set_axis(["id", "value"], axis=1)
    .assign(feature="genre")
)
genre_feature.head()

#### Содержание

In [None]:
# Content type as a long-format (id, value, feature) frame.
content_feature = (
    items.reindex(columns=[Columns.Item, "content_type"])
    .set_axis(["id", "value"], axis=1)
    .assign(feature="content_type")
)
content_feature.head()

#### Режиссёр

In [None]:
# Split the comma-separated, lower-cased directors string into a list per item,
# then emit one (id, value, feature) row per item-director pair.
items["director"] = (
    items["directors"]
    .str.lower()
    .str.replace(", ", ",", regex=False)
    .str.split(",")
)
director_feature = (
    items[["item_id", "director"]]
    .explode("director")
    .set_axis(["id", "value"], axis=1)
    .assign(feature="director")
)
director_feature.head()

#### Страна

In [None]:
# Split the comma-separated, lower-cased countries string into a list per item,
# then emit one (id, value, feature) row per item-country pair.
items["country"] = (
    items["countries"]
    .str.lower()
    .str.replace(", ", ",", regex=False)
    .str.split(",")
)
country_feature = (
    items[["item_id", "country"]]
    .explode("country")
    .set_axis(["id", "value"], axis=1)
    .assign(feature="country")
)
country_feature.head()

#### Год Выпуска

In [None]:
# Release year as a long-format (id, value, feature) frame.
year_feature = (
    items.reindex(columns=[Columns.Item, "release_year"])
    .set_axis(["id", "value"], axis=1)
    .assign(feature="release_year")
)
year_feature.head()

In [None]:
# Stack all per-feature frames into a single long-format item feature table.
item_features = pd.concat((genre_feature, content_feature, country_feature, year_feature, director_feature))
item_features

In [None]:
# Metric classes to evaluate, keyed by the label used in the report.
metrics_name = {
    'Precision': Precision,
    'Recall': Recall,
    'MAP': MAP,
}

# One metric instance per (metric, k) pair for k = 1..10, e.g. "MAP@10".
metrics = {
    f'{label}@{cutoff}': metric_cls(k=cutoff)
    for label, metric_cls in metrics_name.items()
    for cutoff in range(1, 11)
}

pprint(metrics)

### Обучение

In [None]:
# Dataset built from the train part only, with user and item features
# declared as categorical.
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type", "director", "country", "release_year"],
)
# Users for whom recommendations are generated and scored.
TEST_USERS = test[Columns.User].unique()

In [None]:
dataset.interactions.df

In [None]:
# Number of recommendations requested per user.
K_RECOS = 10

# Item-kNN with BM25 weighting, wrapped for rectools: fit on the train dataset
# and recommend for every test user with filter_viewed=True.
model = ImplicitItemKNNWrapperModel(model=BM25Recommender(K=100, K1=0.05, B=0.1, num_threads=2))
model.fit(dataset)
recos = model.recommend(
    users=TEST_USERS,
    dataset=dataset,
    k=K_RECOS,
    filter_viewed=True,
)


In [None]:
# Recommendations for a single user.
# rectools models expose `recommend` (the same call used for TEST_USERS above);
# there is no `predict` method, so the previous call failed with AttributeError.
# NOTE(review): user 123 must be present in `dataset` - confirm, or pick a user from TEST_USERS.
model.recommend(
    users=[123],
    dataset=dataset,
    k=K_RECOS,
    filter_viewed=True,
)

In [None]:
# Score the BM25 recommendations on the hold-out and lay the result out
# with metrics as rows and the model as the single column.
metric_values = calc_metrics(metrics, recos, test, train)
model_quality = {'model': 'BM25Recommender', **metric_values}
results = [model_quality]

df_quality = pd.DataFrame(results).T
# The first row carries the model label: promote it to the header, then remove it.
df_quality.columns = df_quality.iloc[0]
df_quality = df_quality.drop('model')

In [None]:
df_quality.style.highlight_max(color='lightgreen', axis=1)

In [None]:
# Directory where the service expects serialized models (sibling of the notebook dir).
dill_file = Path.cwd().parent / 'service' / 'models'
# Create it on first run: open(..., 'wb') does not create missing directories.
dill_file.mkdir(parents=True, exist_ok=True)

with open(dill_file / 'BM25Recommender_0.095432.dill', 'wb') as f:
    dill.dump(model, f)

In [None]:
# Directory where the service expects serialized datasets (sibling of the notebook dir).
dill_file = Path.cwd().parent / 'service' / 'data'
# Create it on first run: open(..., 'wb') does not create missing directories.
dill_file.mkdir(parents=True, exist_ok=True)

with open(dill_file / 'dataset_BM25Recommender_0.095432.dill', 'wb') as f:
    dill.dump(dataset, f)

### Подбор гиперпараметров

In [None]:
# Train-only dataset with categorical user/item features, rebuilt for the tuning runs.
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type", "director", "country", "release_year"],
)
TEST_USERS = test[Columns.User].unique()

# One dict of metrics per trial; filled by `objective`.
results_opto = []

def objective(trial):
    """Optuna objective: fit one item-kNN variant and score it on the hold-out.

    The trial first picks the similarity model, then only that model's
    hyperparameters. Every hyperparameter has a model-specific name
    ("bm25_K", "cosine_K", "tfidf_K", ...) so that values sampled for one
    model are never recorded under the same parameter name, with a different
    range, as values sampled for another model.

    Args:
        trial: Optuna trial used to sample the configuration.

    Returns:
        MAP@10 of the sampled model on `test` (the value the study maximizes).

    Raises:
        ValueError: if an unknown model name is sampled.

    Side effects:
        Appends a dict with the model label and all computed metrics to the
        module-level `results_opto` list.
    """
    k_recos = 10

    reco_model = trial.suggest_categorical(
        "reco_model",
        ["BM25Recommender", "CosineRecommender", "TFIDFRecommender"],
    )
    model_quality_opto = {"model": f"{reco_model}_{trial.number}"}

    if reco_model == "BM25Recommender":
        n_neighbours = trial.suggest_int("bm25_K", 100, 500, step=50)
        k1 = trial.suggest_float("bm25_K1", 0.01, 0.09)
        b = trial.suggest_float("bm25_B", 0.01, 0.5)
        knn = BM25Recommender(K=n_neighbours, K1=k1, B=b, num_threads=2)
    elif reco_model == "CosineRecommender":
        n_neighbours = trial.suggest_int("cosine_K", 50, 200, step=50)
        knn = CosineRecommender(K=n_neighbours)
    elif reco_model == "TFIDFRecommender":
        # Upper bound is 90, not 100: with low=10 and step=20 the grid is
        # 10, 30, ..., 90, which is what Optuna would silently truncate to anyway.
        n_neighbours = trial.suggest_int("tfidf_K", 10, 90, step=20)
        knn = TFIDFRecommender(K=n_neighbours)
    else:
        raise ValueError(f"Unknown reco_model: {reco_model}")

    # Fit on train and recommend for the test users.
    model = ImplicitItemKNNWrapperModel(model=knn)
    model.fit(dataset)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=k_recos,
        filter_viewed=True,
    )

    # Compute all metrics and keep them for the comparison table.
    metric_values = calc_metrics(metrics, recos, test, train)
    model_quality_opto.update(metric_values)
    results_opto.append(model_quality_opto)

    # Index instead of .get(): a missing key should fail loudly, not return None.
    return metric_values['MAP@10']

In [None]:
# Run the hyperparameter search: 100 trials, maximizing the value returned by `objective`.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

pprint(f"Number of finished trials: {len(study.trials)}")
# Best trial found by the study.
trial = study.best_trial
pprint(f"Best trial: {trial}")

In [None]:
# Metrics of every trial: rows are metrics, columns are "<model>_<trial number>".
df_quality = pd.DataFrame(results_opto).T
# The first row carries the trial labels: promote it to the header, then remove it.
df_quality.columns = df_quality.iloc[0]
df_quality = df_quality.drop('model')

In [None]:
# Drop trials (columns) whose metric values are identical to an earlier trial:
# transpose so trials are rows, de-duplicate, transpose back.
df_quality = df_quality.T.drop_duplicates().T

In [None]:
df_quality.style.highlight_max(color='lightgreen', axis=1)

## Обучение модели с лучшими параметрами

### подготавливаем матрицы

### Создаем маппинг для users и items

In [None]:
dataset.interactions.df

In [None]:
# Matrix row index <-> raw user id (index follows the order of first appearance in train).
users_inv_mapping = dict(enumerate(train['user_id'].unique()))
users_mapping = dict(zip(users_inv_mapping.values(), users_inv_mapping.keys()))

# Matrix column index <-> raw item id, built the same way.
items_inv_mapping = dict(enumerate(train['item_id'].unique()))
items_mapping = dict(zip(items_inv_mapping.values(), items_inv_mapping.keys()))

In [None]:
# Sanity check: number of distinct users and items that received an index.
for label, mapping in (("users", users_mapping), ("items", items_mapping)):
    print(f"{label}_mapping amount: {len(mapping)}")

### Получаем разреженную матрицу

In [None]:
def get_coo_matrix(df, 
                   user_col='user_id', 
                   item_col='item_id', 
                   weight_col=None, 
                   users_mapping=None, 
                   items_mapping=None):
    """Build a sparse user x item interaction matrix in COO format.

    Args:
        df: interactions frame with at least `user_col` and `item_col`.
        user_col: name of the column with raw user ids.
        item_col: name of the column with raw item ids.
        weight_col: optional column with interaction weights; when omitted
            every interaction gets weight 1.
        users_mapping: dict raw user id -> row index (required).
        items_mapping: dict raw item id -> column index (required).

    Returns:
        scipy.sparse.coo_matrix of dtype float32. Its shape is taken from the
        mappings (largest index + 1), so it does not shrink when `df` happens
        to miss the users/items with the largest indices.

    Raises:
        ValueError: if a mapping is not provided, or if `df` contains a user
            or item id that is absent from its mapping.
    """
    # Local import: `import scipy as sp` alone does not guarantee that the
    # `sparse` submodule is loaded on every scipy version.
    from scipy import sparse

    if users_mapping is None or items_mapping is None:
        raise ValueError("users_mapping and items_mapping must be provided")

    if weight_col:
        weights = df[weight_col].astype(np.float32)
    else:
        weights = np.ones(len(df), dtype=np.float32)

    rows = df[user_col].map(users_mapping.get)
    cols = df[item_col].map(items_mapping.get)
    # Ids missing from a mapping come back as NaN and would be rejected by
    # scipy with a much less helpful message.
    if rows.isna().any() or cols.isna().any():
        raise ValueError("df contains user or item ids that are missing from the mappings")

    shape = (
        max(users_mapping.values(), default=-1) + 1,
        max(items_mapping.values(), default=-1) + 1,
    )
    interaction_matrix = sparse.coo_matrix((weights, (rows, cols)), shape=shape)
    return interaction_matrix
# Sparse matrix of train interactions: rows are indexed through users_mapping,
# columns through items_mapping, values come from the 'weight' column.
interaction_matrix = get_coo_matrix(train, weight_col='weight',
                                    users_mapping=users_mapping, 
                                    items_mapping=items_mapping)

### Обучаем UserKNN

In [None]:
# BM25-weighted kNN with hard-coded hyperparameters
# (presumably taken from a tuning run - TODO confirm).
# NOTE(review): the matrix passed to fit has users as rows. Whether the model then
# holds user-user or item-item similarities depends on the orientation expected by
# the installed implicit version - confirm before upgrading implicit.
userknn = BM25Recommender(K=50, K1=0.012556305101247701, B=0.05289835164246949, num_threads=2)
userknn.fit(interaction_matrix)

In [None]:
# Directory where the service expects serialized models (sibling of the notebook dir).
dill_file = Path.cwd().parent / 'service' / 'models'
# Create it on first run: open(..., 'wb') does not create missing directories.
dill_file.mkdir(parents=True, exist_ok=True)

with open(dill_file / 'userknn_BM25Recommender.dill', 'wb') as f:
    dill.dump(userknn, f)

In [None]:
# Directory where the service expects serialized datasets (sibling of the notebook dir).
dill_file = Path.cwd().parent / 'service' / 'data'
# Create it on first run: open(..., 'wb') does not create missing directories.
dill_file.mkdir(parents=True, exist_ok=True)

with open(dill_file / 'dataset_userknn_BM25Recommender.dill', 'wb') as f:
    dill.dump(dataset, f)

## Inference UserKNN

### Предподготовка для ускорения инференса по одному пользователю

In [None]:
# Load the fitted model back from the service's models directory.
# NOTE: dill.load can execute arbitrary code while unpickling - only load
# files produced by this notebook / a trusted source.
dill_file = Path().cwd().parent / 'service' / 'models'

with open(dill_file / 'userknn_BM25Recommender.dill', 'rb') as f:
    userknn = dill.load(f)

In [None]:
# Load the dataset that was saved next to the model.
# NOTE: dill.load can execute arbitrary code while unpickling - only load
# files produced by this notebook / a trusted source.
dill_file = Path().cwd().parent / 'service' / 'data'

with open(dill_file / 'dataset_userknn_BM25Recommender.dill', 'rb') as f:
    dataset = dill.load(f)

In [None]:
# `dataset.interactions.df` stores rectools' internal (re-indexed) user/item ids,
# while `users_mapping` / `users_inv_mapping` are keyed by the raw ids and the
# service must return raw item ids. Convert back to external ids so that the
# idf table, the watched lists and the mapper all work in the same id space.
internal_interactions = dataset.interactions.df
train = internal_interactions.assign(**{
    Columns.User: dataset.user_id_map.convert_to_external(
        internal_interactions[Columns.User].values
    ),
    Columns.Item: dataset.item_id_map.convert_to_external(
        internal_interactions[Columns.Item].values
    ),
})

In [None]:
user_id = 6064
K_RECOS = 10
N = 50  # number of neighbours requested from the kNN model

# doc_freq: how many train interactions each item has.
cnt = Counter(train['item_id'].values)
idf = (
    pd.DataFrame.from_dict(cnt, orient='index', columns=['doc_freq'])
    .reset_index()
)
# n: total number of train interactions.
n = train.shape[0]
# Smoothed inverse frequency: the rarer the item, the larger the weight.
idf['idf'] = [np.log((1 + n) / (1 + doc_freq) + 1) for doc_freq in idf['doc_freq']]

def generate_implicit_recs_mapper(model, N, users_mapping, users_inv_mapping):
    """Create a function that returns the nearest neighbours of a raw user id.

    Args:
        model: fitted model exposing `similar_items(idx, N=...)` that yields
            (index, similarity) pairs.
        N: number of neighbours to request from the model.
        users_mapping: dict raw user id -> model index.
        users_inv_mapping: dict model index -> raw user id.

    Returns:
        Callable taking a raw user id and returning a pair of lists:
        raw ids of the neighbours and their similarities, in model order.
        It raises KeyError for a user id that is not in `users_mapping`.
    """
    def _recs_mapper(user):
        internal_idx = users_mapping[user]
        neighbours = model.similar_items(internal_idx, N=N)
        neighbour_ids = [users_inv_mapping[idx] for idx, _ in neighbours]
        similarities = [score for _, score in neighbours]
        return neighbour_ids, similarities

    return _recs_mapper

In [None]:
# Neighbour lookup bound to the fitted model: raw user id -> (N neighbour ids, similarities).
mapper = generate_implicit_recs_mapper(
    userknn, 
    N=N,
    users_mapping=users_mapping,
    users_inv_mapping=users_inv_mapping
)

In [None]:
# For every user: the list of item ids they interacted with in train (indexed by user_id).
watched = train.groupby('user_id').agg({'item_id': list})
watched.head()

In [None]:
# Plain dict "user id as string" -> list of watched item ids.
# Keys are strings because make_reco_fast looks users up with f"{user_id}".
# Built directly from the frame instead of serialising the whole table to
# JSON and parsing it back, which produced the same dict at a much higher cost.
wwatched = {str(user): user_items for user, user_items in watched['item_id'].items()}

In [None]:
def make_reco_fast(user_id, mapper, k_recos):
    """Recommend up to `k_recos` items for one user from its nearest neighbours.

    Items watched by the neighbours are scored with `similarity * idf` and the
    best-scored distinct items are returned. Relies on the module-level
    `wwatched` (str user id -> watched items) and `idf` tables.

    Args:
        user_id: raw id of the user; must be known to `mapper`.
        mapper: callable returning (neighbour ids, similarities) for a user id.
        k_recos: maximum number of items to return.

    Returns:
        Array of at most `k_recos` distinct item ids, best first. May be
        shorter (or empty) when the neighbours have too few items.
    """
    similar_users, similarities = mapper(user_id)
    # Drop the user itself by id rather than by position: the user is usually,
    # but not necessarily, the first neighbour returned by the model.
    neighbours = [
        (similar_user, similarity)
        for similar_user, similarity in zip(similar_users, similarities)
        if similar_user != user_id
    ]
    if not neighbours:
        return np.array([], dtype=object)

    recs = pd.DataFrame(neighbours, columns=['similar_user_id', 'similarity'])
    recs['item_id'] = [wwatched.get(f"{x}") for x in recs['similar_user_id']]
    recs = recs.explode('item_id')
    recs = recs.merge(idf[['index', 'idf']], 
                        left_on='item_id',
                        right_on='index',
                        how='left').drop(['index'], axis=1)
    recs['rank_idf'] = recs['similarity'] * recs['idf']
    # Neighbours without watched items / items without idf produce NaN rows.
    recs = recs.dropna().sort_values(['rank_idf'], ascending=False)
    return recs['item_id'].unique()[:k_recos]

In [None]:
def make_reco(user_id, mapper, k_recos):
    """Recommend up to `k_recos` distinct items for one user (reference version).

    Same scoring as `make_reco_fast` (`similarity * idf` over the items watched
    by the nearest neighbours), implemented with DataFrame joins. Relies on
    the module-level `train`, `watched` and `idf` tables.

    Args:
        user_id: raw id of the user.
        mapper: callable returning (neighbour ids, similarities) for a user id.
        k_recos: maximum number of items to return.

    Returns:
        Array of at most `k_recos` distinct item ids, best first. Empty when
        the user is not present in `train`.
    """
    recs = pd.DataFrame({
        'user_id': train[train['user_id'] == user_id]['user_id'].unique()
    })
    # Unknown user: nothing to map, and zip(*...) below would fail to unpack.
    if recs.empty:
        return np.array([], dtype=object)

    recs['similar_user_id'], recs['similarity'] = zip(*recs['user_id'].map(mapper))

    # explode lists to get vertical representation
    recs = recs.set_index('user_id').apply(pd.Series.explode).reset_index()

    # delete recommendations of itself 
    recs = recs[~(recs['user_id'] == recs['similar_user_id'])]

    # join watched items
    recs = recs.merge(watched, left_on=['similar_user_id'], right_on=['user_id'], how='left')
    recs = recs.explode('item_id')

    # score every (neighbour, item) pair
    recs = recs.merge(idf[['index', 'idf']], 
                        left_on='item_id',
                        right_on='index',
                        how='left').drop(['index'], axis=1)
    recs['rank_idf'] = recs['similarity'] * recs['idf']
    recs = recs.dropna(subset=['item_id', 'rank_idf'])
    recs = recs.sort_values(['user_id', 'rank_idf'], ascending=False)

    # drop duplicated user_id-item_id pairs, keeping the best-scored one
    # (idf is constant per item, so this is the pair with the largest similarity)
    recs = recs.drop_duplicates(['user_id', 'item_id'], keep='first')

    recs['rank'] = recs.groupby('user_id').cumcount() + 1 
    return recs[recs['rank'] <= k_recos]['item_id'].values

In [None]:
user_id = 100
make_reco(user_id, mapper, K_RECOS)

In [None]:
make_reco_fast(user_id, mapper, K_RECOS)