In [1]:
import warnings

warnings.simplefilter("ignore")

import dill
import numpy as np
import pandas as pd
import requests
import datetime
import shap

from typing import Callable

from xgboost import XGBRanker
from catboost import CatBoostRanker, Pool

from pathlib import Path

from rectools.models import (
    ImplicitItemKNNWrapperModel,
    PopularModel,
    PureSVDModel,
)
from implicit.nearest_neighbours import (
    CosineRecommender,
    TFIDFRecommender,
    BM25Recommender,
)

import rectools.dataset as rd

from lightfm import LightFM
from lightfm.data import Dataset
from lightgbm import LGBMRanker, LGBMClassifier
from rectools.metrics import (
    calc_metrics,
    NDCG,
    MAP,
    Precision,
    Recall,
    MeanInvUserFreq,
)
from rectools import Columns
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from typing import Any, Dict, Tuple
from tqdm.auto import tqdm
from zipfile import ZipFile

## Подготовка данных

Используем данные Kion, как и в прошлых лекциях, чтобы меньше отвлекаться на фича-инжиниринг.

In [2]:
# Download the Kion dataset in chunks, but only when it is not cached locally.
url = "https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip"

PATH_TO_DATA = Path("../data/kion_train.zip")

if not PATH_TO_DATA.exists():
    PATH_TO_DATA.parent.mkdir(parents=True, exist_ok=True)
    # Stream into a temporary sibling file first: an interrupted download must
    # not leave a truncated archive that the exists() check would later accept.
    partial_path = PATH_TO_DATA.with_name(PATH_TO_DATA.name + ".part")
    # The request is opened only when a download is needed and is closed by the
    # context manager; raise_for_status() stops an HTTP error page from being
    # saved as the archive.
    with requests.get(url, stream=True, timeout=60) as req:
        req.raise_for_status()
        total_size_in_bytes = int(req.headers.get("Content-Length", 0))
        with partial_path.open("wb") as f, tqdm(
            desc="kion dataset download",
            total=total_size_in_bytes,
            unit="iB",
            unit_scale=True,
        ) as progress_bar:
            for chunk in req.iter_content(chunk_size=2**20):
                progress_bar.update(len(chunk))
                f.write(chunk)
    # Publish the archive under its final name only after a complete download.
    partial_path.replace(PATH_TO_DATA)

In [3]:
# Archive member that holds each raw table.
kion_members = {
    "interactions": "kion_train/interactions.csv",
    "items": "kion_train/items.csv",
    "users": "kion_train/users.csv",
}

# Read every table straight from the zip without extracting it to disk.
kion_tables = {}
with ZipFile(PATH_TO_DATA) as archive:
    for table_name, member in kion_members.items():
        with archive.open(member) as member_file:
            kion_tables[table_name] = pd.read_csv(member_file)

# pop() hands each frame over so the helper dict keeps no extra references.
interactions = kion_tables.pop("interactions")
items = kion_tables.pop("items")
users = kion_tables.pop("users")

In [4]:
# Only the columns listed in `cols` are passed to the rankers as features:
cols = [
    "itemknn_score",
    "itemknn_rank",
    "age",
    "income",
    "sex",
    "kids_flg",
    "user_hist",
    "user_avg_pop",
    "user_last_pop",
    "content_type",
    "release_year",
    "for_kids",
    "age_rating",
    "studios",
    "item_pop",
    "item_avg_hist",
]
# The subset of `cols` that is treated as categorical:
cat_cols = [
    "age",
    "income",
    "sex",
    "kids_flg",
    "content_type",
    "for_kids",
    "studios",
]

### `interactions`: взаимодействия пользователь - айтем 
- с датой `last_watch_dt`
- длительностью просмотра `total_dur`
- % просмотра `watched_pct`

In [5]:
print(interactions.shape)
interactions.head(3)

(5476251, 5)


Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0


In [6]:
# Rename columns to the names rectools expects.
interactions.rename(
    columns={
        "last_watch_dt": Columns.Datetime,
        "watched_pct": Columns.Weight,
    },
    inplace=True,
)
# Parse the date column. pd.to_datetime is used instead of
# astype(np.datetime64): the latter is a unit-less cast that pandas 2.x rejects,
# while to_datetime yields datetime64[ns] on every pandas version.
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime])

# Fill missing values: median date, zero watch share, zero duration.
interactions_default_values: Dict[str, Any] = {
    Columns.Datetime: interactions[Columns.Datetime].median(),
    Columns.Weight: 0.0,
    "total_dur": 0,
}
interactions.fillna(interactions_default_values, inplace=True)

interactions.head(10)

Unnamed: 0,user_id,item_id,datetime,total_dur,weight
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0
5,1032142,6686,2021-05-13,11286,100.0
6,1016458,354,2021-08-14,1672,25.0
7,884009,693,2021-08-04,703,14.0
8,648682,1449,2021-06-13,26246,75.0
9,203219,13582,2021-08-22,6975,100.0


In [7]:
interactions.describe(datetime_is_numeric=True)

Unnamed: 0,user_id,item_id,datetime,total_dur,weight
count,5476251.0,5476251.0,5476251,5476251.0,5476251.0
mean,549115.9,8485.86,2021-06-23 11:56:04.296268032,8295.49,46.36699
min,0.0,0.0,2021-03-13 00:00:00,1.0,0.0
25%,275014.0,4326.0,2021-05-25 00:00:00,371.0,4.0
50%,549364.0,8776.0,2021-07-01 00:00:00,2898.0,32.0
75%,822754.0,12544.0,2021-07-30 00:00:00,7116.0,100.0
max,1097557.0,16518.0,2021-08-22 00:00:00,80411670.0,100.0
std,316607.7,4740.762,,48567.99,41.91923


### `users`: данные о пользователях

- `age` бин по возрасту 
- `income` бин по доходу 
- `sex` пол 
- `kids_flg` флаг наличия детей

Все признаки - результат предсказания соцдем моделей

In [8]:
print(users.shape)
users.head(3)

(840197, 5)


Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
2,1047345,age_45_54,income_40_60,Ж,0


In [9]:
def encode_cat_cols(df: pd.DataFrame, cat_cols) -> Tuple[pd.DataFrame, Dict]:
    """Replace categorical columns with their integer category codes.

    The input frame is left untouched; the encoding is applied to a copy.

    Args:
        df: frame that contains every column named in ``cat_cols``.
        cat_cols: iterable of column names to encode.

    Returns:
        A tuple ``(encoded_df, encoding)`` where ``encoded_df`` is a copy of
        ``df`` whose ``cat_cols`` hold category codes (stored with ``category``
        dtype; missing values get code -1) and ``encoding`` maps each column
        name to the index of its original categories, so that
        ``encoding[col][code]`` recovers the original value.
    """
    # Work on a copy so the caller's frame is never modified as a side effect.
    encoded = df.copy()
    cat_col_encoding = {}
    for col in cat_cols:
        as_category = encoded[col].astype("category")
        cat_col_encoding[col] = as_category.cat.categories
        encoded[col] = as_category.cat.codes.astype("category")
    return encoded, cat_col_encoding


users_cat_cols = [
    "age",
    "income",
    "sex",
    "kids_flg",
]
users, users_cat_col_encoding = encode_cat_cols(users, users_cat_cols)

# None уже кодируется как -1
users_cat_col_encoding["income"], users["income"].unique()

(Index(['income_0_20', 'income_150_inf', 'income_20_40', 'income_40_60',
        'income_60_90', 'income_90_150'],
       dtype='object'),
 [4, 2, 3, 0, -1, 5, 1]
 Categories (7, int64): [-1, 0, 1, 2, 3, 4, 5])

In [10]:
users.head(3)

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,1,4,1,1
1,962099,0,2,1,0
2,1047345,3,3,0,0


In [11]:
# Summary of the encoded categorical user columns.
# NOTE(review): in the displayed output the most frequent income code is 2,
# which users_cat_col_encoding["income"] maps to 'income_20_40' (the original
# note named income_40_60) — confirm on re-run.
users.describe(include="category")

Unnamed: 0,age,income,sex,kids_flg
count,840197,840197,840197,840197
unique,7,7,3,2
top,1,2,0,0
freq,233926,471519,425270,587209


### `items`: данные об айтемах

- `content_type` - тип контента
- `title` - название на русском
- `title_orig` - название оригинальное
- `release_year` - год выпуска
- `countries` - страны
- `for_kids` - флаг контент для детей
- `age_rating`- Возрастной рейтинг
- `studios` - студии
- `directors` - режиссеры
- `actors`- актеры
- `keywords` - ключевые слова 
- `description` - описание

In [12]:
print(items.shape)
items.head(3)

(15963, 14)


Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг..."


In [13]:
# Column groups of the raw items table.
items_cat_cols = ["content_type", "for_kids", "studios"]
items_text_cols = [
    "title",
    "title_orig",
    "genres",
    "countries",
    "directors",
    "actors",
    "description",
    "keywords",
]
items_num_cols = ["release_year", "age_rating"]

# Missing numeric values are filled with the column median.
default_values_items = {
    num_col: items[num_col].median() for num_col in items_num_cols
}

# Encode categoricals, drop the free-text columns, then fill numeric gaps.
items, items_cat_col_encoding = encode_cat_cols(items, items_cat_cols)
items = items.drop(columns=items_text_cols).fillna(default_values_items)

items_cat_col_encoding["studios"]

Index(['ABC', 'Amediateka', 'BBC', 'CBS', 'CBS All Access', 'Channel 4',
       'Cinemax', 'DAZN', 'Disney', 'Endemol', 'FX', 'Fox', 'Fremantle', 'HBO',
       'HBO Max', 'HBO, BBC', 'Legendary', 'MGM', 'New Regency Productions',
       'Paramount', 'Showtime', 'Sky', 'Sky, Fremantle', 'Sony Pictures',
       'Sony Pictures Television', 'Sony Pictures, рентв', 'Sony Plus',
       'Sony Plus, рентв', 'Starz', 'Universal', 'Universal, рентв',
       'Warner Bros', 'Warner Bros. Television', 'Ленфильм', 'Ленфильм, рентв',
       'Мосфильм', 'Рок фильм', 'рентв'],
      dtype='object')

In [14]:
items.describe(include="all")

Unnamed: 0,item_id,content_type,release_year,for_kids,age_rating,studios
count,15963.0,15963.0,15963.0,15963.0,15963.0,15963.0
unique,,2.0,,3.0,,39.0
top,,0.0,,-1.0,,-1.0
freq,,12002.0,,15397.0,,14898.0
mean,8268.453424,,2007.969492,,12.945937,
std,4773.841513,,16.83589,,5.49576,
min,0.0,,1897.0,,0.0,
25%,4141.5,,2007.0,,12.0,
50%,8266.0,,2014.0,,16.0,
75%,12414.5,,2018.0,,16.0,


In [15]:
items.head()

Unnamed: 0,item_id,content_type,release_year,for_kids,age_rating,studios
0,10711,0,2002.0,-1,16.0,-1
1,2508,0,2014.0,-1,16.0,-1
2,10716,0,2011.0,-1,16.0,-1
3,7868,0,2015.0,-1,16.0,-1
4,16268,0,1978.0,-1,12.0,33


## Трейн-вал-тест сплит

In [16]:
max_date = interactions[Columns.Datetime].max()
min_date = interactions[Columns.Datetime].min()

print(f"min дата в interactions: {min_date}")
print(f"max дата в interactions: {max_date}")
print(f"Продолжительность: {max_date - min_date}")

min дата в interactions: 2021-03-13 00:00:00
max дата в interactions: 2021-08-22 00:00:00
Продолжительность: 162 days 00:00:00


In [17]:
ranker_days_count = 30

# Interactions from the last `ranker_days_count` days form the ranker data.
ranker_start_date = max_date - pd.Timedelta(days=ranker_days_count)
ranker_data = interactions.loc[
    interactions[Columns.Datetime] >= ranker_start_date
]

train_size = 0.7
val_size = 0.15
test_size = 0.15

# The split is done over users, so each user lands in exactly one part.
ranker_user_ids = ranker_data["user_id"].unique()
train_val_users, test_users = train_test_split(
    ranker_user_ids, test_size=test_size, random_state=42
)

# Rescaled share: validation is 15% of all ranker users, not of the remainder.
val_share = val_size / (train_size + val_size)
train_users, val_users = train_test_split(
    train_val_users, test_size=val_share, random_state=42
)

In [18]:
# Interactions older than the ranker window train the first-level model.
# .copy() makes this an independent frame: later cells add feature columns to
# it and sort it in place, which on a bare boolean-mask slice raises
# SettingWithCopyWarning (hidden here by the global warnings filter).
base_models_data = interactions[
    (
        interactions[Columns.Datetime]
        < max_date - pd.Timedelta(days=ranker_days_count)
    )
].copy()

In [19]:
base_model_dataset = rd.Dataset.construct(
    interactions_df=base_models_data,
    user_features_df=None,
    item_features_df=None,
)

## Обучаем модель первого уровня

In [20]:
def generate_itemknn_model_recs_mapper(
    model: ImplicitItemKNNWrapperModel,
    N: int,
    dataset: rd.Dataset,
) -> Callable:
    """Build a function that returns first-level recommendations for one user.

    The user-item matrix and the id maps are read from ``dataset`` once and
    captured by the returned closure.

    Args:
        model: model whose ``_recommend_for_user`` produces the candidates.
        N: number of recommendations requested per user.
        dataset: dataset providing the weighted user-item matrix and id maps.

    Returns:
        A callable ``user -> (item_ids, scores)``: ``item_ids`` is a list of
        external item ids and ``scores`` is the score sequence returned by the
        model for them. The lookup of ``user`` in the dataset's user id map
        fails for ids the map does not contain.
    """
    # Local annotations that referenced names never imported in this notebook
    # (sparse, IdMap, Series) were dropped; they were not evaluated at runtime.
    user_items = dataset.get_user_item_matrix(include_weights=True)
    item_id_map = dataset.item_id_map
    to_internal_user_id = dataset.user_id_map.to_internal

    def _recs_mapper(user):
        user_id = to_internal_user_id[user]
        # NOTE(review): relies on a private model method — re-check the
        # signature when upgrading the library.
        rec_ids, scores = model._recommend_for_user(
            user_id,
            user_items,
            N,
            filter_viewed=True,
            sorted_item_ids=None,
        )
        return item_id_map.convert_to_external(rec_ids).tolist(), scores

    return _recs_mapper

In [21]:
model = ImplicitItemKNNWrapperModel(BM25Recommender(K=20))

In [22]:
model.fit(base_model_dataset)

<rectools.models.implicit_knn.ImplicitItemKNNWrapperModel at 0x7f0eacaefc40>

In [23]:
# save model
model_path = Path("../data/hw5/models/itemknn_model.dill")
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open("wb") as f:
    dill.dump(model, f)

## Генерим кандидатов, которыми дополним датасет ранкера

In [24]:
# Топ 100 кандидатов
top_N = 100

mapper = generate_itemknn_model_recs_mapper(
    model=model,
    N=top_N,
    dataset=base_model_dataset,
)

In [25]:
candidates_path = Path("../data/hw5/candidates.csv")
if candidates_path.exists():
    # Load the cached list of candidate items.
    candidates = pd.read_csv(candidates_path)
else:
    # Generate first-level predictions: item ids and scores per user.
    candidates = pd.DataFrame(
        {"user_id": base_models_data["user_id"].unique()}
    )
    candidates["item_id"], candidates["itemknn_score"] = zip(
        *tqdm(
            map(mapper, candidates["user_id"]),
            total=len(candidates["user_id"]),
        )
    )
    candidates = candidates.explode(
        ["item_id", "itemknn_score"], ignore_index=True
    )

    # Drop rows with NaN (presumably users for whom the mapper returned an
    # empty list — explode() turns an empty list into a NaN row).
    candidates = candidates.dropna()
    # explode() leaves the exploded columns with object dtype. Restore numeric
    # dtypes so this branch produces the same frame as the CSV-cached branch;
    # otherwise the object-typed score column reaches the rankers.
    candidates = candidates.astype(
        {
            "item_id": base_models_data["item_id"].dtype,
            "itemknn_score": "float64",
        }
    )
    # Rows keep the order produced by the mapper, so the position inside each
    # user's block is the first-level rank.
    candidates["itemknn_rank"] = candidates.groupby("user_id").cumcount() + 1

    # Checkpoint: save the candidates.
    candidates_path.parent.mkdir(parents=True, exist_ok=True)
    candidates.to_csv(candidates_path, index=False)
candidates.head(3)

Unnamed: 0,user_id,item_id,itemknn_score,itemknn_rank
0,176549,7571,49795630.0,1
1,176549,11985,39361310.0,2
2,176549,16270,35120650.0,3


In [26]:
def calc_metrics_(candidates_df, rank_col: str) -> Dict[str, float]:
    """Compute @10 ranking metrics for the test users.

    Args:
        candidates_df: recommendations holding user, item and ``rank_col``.
        rank_col: name of the column to use as the recommendation rank.

    Returns:
        Mapping from metric name to value, as returned by ``calc_metrics``.
        Ground truth is ``ranker_data``, viewing history is
        ``base_models_data``; both are restricted to ``test_users``.
    """
    interaction_cols = [
        Columns.User,
        Columns.Item,
        Columns.Datetime,
        Columns.Weight,
    ]

    # Recommendations of test users, with the chosen rank column renamed.
    test_reco = candidates_df.loc[
        candidates_df[Columns.User].isin(test_users)
    ].rename(columns={rank_col: Columns.Rank})[
        [Columns.User, Columns.Item, Columns.Rank]
    ]
    test_interactions = ranker_data.loc[
        ranker_data[Columns.User].isin(test_users), interaction_cols
    ]
    test_history = base_models_data.loc[
        base_models_data[Columns.User].isin(test_users), interaction_cols
    ]

    metrics = {
        "ndcg@10": NDCG(k=10),
        "map@10": MAP(k=10),
        "Precision@10": Precision(k=10),
        "recall@10": Recall(k=10),
        "novelty@10": MeanInvUserFreq(k=10),
    }
    return calc_metrics(
        metrics=metrics,
        reco=test_reco,
        interactions=test_interactions,
        prev_interactions=test_history,
        catalog=items["item_id"].unique(),
    )


models_metrics: Dict[str, Dict[str, float]] = dict()

In [27]:
models_metrics["itemknn"] = calc_metrics_(candidates, "itemknn_rank")
models_metrics["itemknn"]

{'Precision@10': 0.024064893404754963,
 'recall@10': 0.07456844850217005,
 'ndcg@10': 0.028113827310066546,
 'map@10': 0.03318986888129458,
 'novelty@10': 4.341873763521282}

## Формируем датасет для ранкера

### Генерим фичи для ранкера

In [28]:
# Aggregates over the first-level training window. Inside one assign() call
# later entries can read the columns created by earlier ones.
base_models_data = base_models_data.assign(
    # Length of the user's history.
    user_hist=lambda d: d.groupby("user_id")["item_id"].transform("count"),
    # Item popularity: number of interactions with the item.
    item_pop=lambda d: d.groupby("item_id")["user_id"].transform("count"),
    # Mean popularity of the items the user interacted with.
    user_avg_pop=lambda d: d.groupby("user_id")["item_pop"].transform("mean"),
    # Mean history length of the users who interacted with the item.
    item_avg_hist=lambda d: d.groupby("item_id")["user_hist"].transform(
        "mean"
    ),
)
# Newest interaction first within each user, so that "first" below picks the
# popularity of the most recently watched item.
base_models_data = base_models_data.sort_values(
    by=[Columns.User, Columns.Datetime],
    ascending=[True, False],
    ignore_index=True,
)
base_models_data["user_last_pop"] = base_models_data.groupby("user_id")[
    "item_pop"
].transform("first")
base_models_data.head(3)

Unnamed: 0,user_id,item_id,datetime,total_dur,weight,user_hist,item_pop,user_avg_pop,item_avg_hist,user_last_pop
0,0,6006,2021-07-20,1,0.0,6,5208,41885.0,16.891897,5208
1,0,7102,2021-07-19,169,3.0,6,11626,41885.0,20.349475,5208
2,0,14359,2021-07-19,130,2.0,6,6053,41885.0,22.546836,5208


In [29]:
# Добавляем новые фичи в соответствующие таблички
items = pd.merge(
    left=items,
    right=(
        base_models_data[
            ["item_id", "item_pop", "item_avg_hist"]
        ].drop_duplicates()
    ),
    how="left",
    on="item_id",
)
items.head(3)

Unnamed: 0,item_id,content_type,release_year,for_kids,age_rating,studios,item_pop,item_avg_hist
0,10711,0,2002.0,-1,16.0,-1,4.0,63.25
1,2508,0,2014.0,-1,16.0,-1,9.0,75.222222
2,10716,0,2011.0,-1,16.0,-1,6.0,65.666667


In [30]:
users = pd.merge(
    left=users,
    right=(
        base_models_data[
            ["user_id", "user_hist", "user_avg_pop", "user_last_pop"]
        ].drop_duplicates()
    ),
    how="left",
    on="user_id",
)
users.head(3)

Unnamed: 0,user_id,age,income,sex,kids_flg,user_hist,user_avg_pop,user_last_pop
0,973171,1,4,1,1,5.0,19550.8,93403.0
1,962099,0,2,1,0,13.0,1329.307692,260.0
2,1047345,3,3,0,0,,,


In [31]:
default_values_items["item_pop"] = base_models_data["item_pop"].median()
default_values_items["item_avg_hist"] = base_models_data[
    "item_avg_hist"
].median()

default_values_users = {
    "user_hist": 0,
    "user_avg_pop": base_models_data["user_avg_pop"].median(),
    "user_last_pop": base_models_data["user_last_pop"].median(),
}

In [32]:
# Сохранение данных для сервиса
for item in [
    "base_model_dataset",
    "users",
    "items",
    "default_values_users",
    "default_values_items",
    "interactions_default_values",
]:
    item_path = Path(f"../data/hw5/service/{item}.dill")
    item_path.parent.mkdir(parents=True, exist_ok=True)
    with item_path.open("wb") as f:
        dill.dump(locals()[item], f)

### Джойним кандидатов и юзер/айтем фичи

In [33]:
# Оставляем только тех, для кого есть и рекомы и таргеты


def users_filter(
    user_list: np.ndarray,
    candidates_df: pd.DataFrame,
    df: pd.DataFrame,
) -> pd.DataFrame:
    """Join interactions and first-level candidates for the given users.

    Both frames are restricted to ``user_list`` and outer-joined on
    ``(user_id, item_id)``, so the result keeps pairs present in either frame.
    Rows without first-level values get a score 0.01 below the minimum and a
    rank one above the maximum found in the joined frame; missing interaction
    fields are filled from ``interactions_default_values``.

    Returns:
        The joined frame sorted by ``user_id`` and ``item_id``.
    """
    user_interactions = df[df["user_id"].isin(user_list)]
    user_candidates = candidates_df[candidates_df["user_id"].isin(user_list)]
    merged = user_interactions.merge(
        user_candidates,
        how="outer",
        on=["user_id", "item_id"],
    )

    fill_values = {
        "itemknn_score": merged["itemknn_score"].min() - 0.01,
        "itemknn_rank": merged["itemknn_rank"].max() + 1,
        **interactions_default_values,
    }
    return merged.fillna(fill_values).sort_values(by=["user_id", "item_id"])


ranker_train = users_filter(train_users, candidates, ranker_data)
ranker_val = users_filter(val_users, candidates, ranker_data)
ranker_test = users_filter(test_users, candidates, ranker_data)

ranker_train.head(3)

Unnamed: 0,user_id,item_id,datetime,total_dur,weight,itemknn_score,itemknn_rank
1180642,3,47,2021-08-16,2179.0,27.0,-0.01,101.0
998971,3,142,2021-08-13,5892.0,100.0,9422243.0,7.0
1067328,3,965,2021-08-16,5813.0,96.0,-0.01,101.0


In [34]:
# Добавляем фичи
def add_features(df: pd.DataFrame) -> pd.DataFrame:
    """Attach user and item features to every ``(user_id, item_id)`` row.

    ``users`` and ``items`` are left-joined onto ``df``. Gaps are filled from
    ``default_values_items`` and then ``default_values_users``; any value still
    missing in a categorical column becomes the category -1.

    Returns:
        The enriched frame (a new object; ``df`` itself is not modified).
    """
    enriched = df.merge(users, how="left", on=["user_id"]).merge(
        items, how="left", on=["item_id"]
    )
    enriched = enriched.fillna(default_values_items).fillna(
        default_values_users
    )

    categorical_cols = [
        name
        for name in enriched.columns
        if isinstance(enriched[name].dtype, pd.CategoricalDtype)
    ]
    for name in categorical_cols:
        # -1 must be a known category before it can be used as a fill value.
        if -1 not in enriched[name].cat.categories:
            enriched[name] = enriched[name].cat.add_categories(-1)
        enriched[name] = enriched[name].fillna(-1)
    return enriched

ranker_train = add_features(ranker_train)
ranker_val = add_features(ranker_val)
ranker_test = add_features(ranker_test)

ranker_train.head(3)

Unnamed: 0,user_id,item_id,datetime,total_dur,weight,itemknn_score,itemknn_rank,age,income,sex,...,user_hist,user_avg_pop,user_last_pop,content_type,release_year,for_kids,age_rating,studios,item_pop,item_avg_hist
0,3,47,2021-08-16,2179.0,27.0,-0.01,101.0,-1,-1,-1,...,0.0,11957.864865,2858.0,0,2017.0,-1,18.0,-1,1249.0,38.180144
1,3,142,2021-08-13,5892.0,100.0,9422243.0,7.0,-1,-1,-1,...,0.0,11957.864865,2858.0,0,2020.0,-1,16.0,-1,35862.0,15.251464
2,3,965,2021-08-16,5813.0,96.0,-0.01,101.0,-1,-1,-1,...,0.0,11957.864865,2858.0,0,2018.0,-1,12.0,-1,536.0,29.130597


In [35]:
def add_target(df: pd.DataFrame) -> pd.DataFrame:
    """Add the graded relevance label ``target_ranker`` to ``df`` in place.

    The label is derived from the ``Columns.Weight`` column:
        0 - weight < 15
        1 - 15 <= weight < 75
        2 - 75 <= weight

    Returns:
        The same ``df`` object with the new column.
    """
    watched = df[Columns.Weight]
    # Each threshold that is reached contributes one point to the grade.
    df["target_ranker"] = (watched >= 15).astype(int) + (watched >= 75).astype(
        int
    )
    return df


ranker_train = add_target(ranker_train)
ranker_val = add_target(ranker_val)
ranker_test = add_target(ranker_test)

ranker_train.head(3)

Unnamed: 0,user_id,item_id,datetime,total_dur,weight,itemknn_score,itemknn_rank,age,income,sex,...,user_avg_pop,user_last_pop,content_type,release_year,for_kids,age_rating,studios,item_pop,item_avg_hist,target_ranker
0,3,47,2021-08-16,2179.0,27.0,-0.01,101.0,-1,-1,-1,...,11957.864865,2858.0,0,2017.0,-1,18.0,-1,1249.0,38.180144,1
1,3,142,2021-08-13,5892.0,100.0,9422243.0,7.0,-1,-1,-1,...,11957.864865,2858.0,0,2020.0,-1,16.0,-1,35862.0,15.251464,2
2,3,965,2021-08-16,5813.0,96.0,-0.01,101.0,-1,-1,-1,...,11957.864865,2858.0,0,2018.0,-1,12.0,-1,536.0,29.130597,2


In [36]:
def filter_group(df: pd.DataFrame, min_group_size: int = 80) -> pd.DataFrame:
    """Keep only users whose group has at least ``min_group_size`` rows.

    Args:
        df: frame with ``user_id`` and ``item_id`` columns; it is not modified.
        min_group_size: minimum number of rows (non-null ``item_id`` values) a
            user must have to be kept. Defaults to 80.

    Returns:
        A new frame sorted by ``user_id`` and ``item_id`` that holds only the
        rows of sufficiently large groups, with the same columns as ``df``.
    """
    # Sort a copy rather than the caller's frame, and renumber the rows so the
    # result index matches the positions in the sorted frame.
    ordered = df.sort_values(by=["user_id", "item_id"]).reset_index(drop=True)
    # Per-row group size, computed without a helper column or an extra merge.
    group_size = ordered.groupby("user_id")["item_id"].transform("count")
    # Drop groups without enough views/candidates. .copy() returns an
    # independent frame, so later column assignments by the caller do not hit
    # SettingWithCopyWarning.
    return ordered[group_size >= min_group_size].copy()


ranker_train = filter_group(ranker_train)
ranker_val = filter_group(ranker_val)
# ranker_test = filter_group(ranker_test)

In [37]:
ranker_train[cat_cols] = ranker_train[cat_cols].astype(int)
ranker_val[cat_cols] = ranker_val[cat_cols].astype(int)
ranker_test[cat_cols] = ranker_test[cat_cols].astype(int)

In [38]:
for name in ["train", "val", "test"]:
    path = Path(f"../data/hw5/ranker_{name}.csv")
    if not path.exists():
        locals()[f"ranker_{name}"].to_csv(path, index=False)

## Обучаем ранкер

In [39]:
def get_group(df: pd.DataFrame) -> np.ndarray:
    """Return the number of non-null ``item_id`` values per user.

    The counts are ordered by ascending ``user_id``, so they match the row
    order only when ``df`` itself is sorted by ``user_id``.
    """
    per_user_counts = df.groupby("user_id")["item_id"].count()
    return per_user_counts.to_numpy()

In [40]:
# Загружаем данные
for name in ["train", "val", "test"]:
    if f"ranker_{name}" not in locals():
        path = Path(f"../data/hw5/ranker_{name}.csv")
        locals()[f"ranker_{name}"] = pd.read_csv(path)

In [41]:
# Hyperparameters of the listwise LightGBM ranker.
params = {
    "objective": "lambdarank",  # LambdaRank objective, optimizes NDCG
    "n_estimators": 10000,  # maximum number of trees
    "max_depth": 4,  # maximum tree depth
    "num_leaves": 10,  # number of leaves, kept below 2^max_depth
    "min_child_samples": 100,  # minimum number of samples in a leaf
    "learning_rate": 0.25,  # learning rate
    "reg_lambda": 1,  # L2 regularization
    "colsample_bytree": 0.9,  # share of columns used for each tree
    "random_state": 42,
}
early_stopping_rounds = 32
# Group sizes come from get_group, which orders them by user_id; the frames
# must therefore be sorted by user_id.
# NOTE(review): `early_stopping_rounds` and `verbose` are passed to fit() here;
# newer LightGBM releases expect callbacks instead — confirm against the
# installed version. `verbose` evaluates to the float 4.0; the log below prints
# every 4th iteration.
fit_params = {
    "X": ranker_train[cols],
    "y": ranker_train["target_ranker"],
    "group": get_group(ranker_train),
    "eval_set": [(ranker_val[cols], ranker_val["target_ranker"])],
    "eval_group": [get_group(ranker_val)],
    "eval_metric": "ndcg",
    "eval_at": (3, 5, 10),
    "early_stopping_rounds": early_stopping_rounds,
    "categorical_feature": cat_cols,
    "feature_name": cols,
    "verbose": early_stopping_rounds / 8,
}
listwise_model = LGBMRanker(**params)
listwise_model.fit(**fit_params)

[4]	valid_0's ndcg@3: 0.643962	valid_0's ndcg@5: 0.671561	valid_0's ndcg@10: 0.705344
[8]	valid_0's ndcg@3: 0.648764	valid_0's ndcg@5: 0.675321	valid_0's ndcg@10: 0.707564
[12]	valid_0's ndcg@3: 0.653477	valid_0's ndcg@5: 0.679437	valid_0's ndcg@10: 0.711789
[16]	valid_0's ndcg@3: 0.653453	valid_0's ndcg@5: 0.680424	valid_0's ndcg@10: 0.712568
[20]	valid_0's ndcg@3: 0.654044	valid_0's ndcg@5: 0.681304	valid_0's ndcg@10: 0.712904
[24]	valid_0's ndcg@3: 0.656378	valid_0's ndcg@5: 0.682488	valid_0's ndcg@10: 0.714243
[28]	valid_0's ndcg@3: 0.656949	valid_0's ndcg@5: 0.683125	valid_0's ndcg@10: 0.71468
[32]	valid_0's ndcg@3: 0.657173	valid_0's ndcg@5: 0.683054	valid_0's ndcg@10: 0.71482
[36]	valid_0's ndcg@3: 0.658554	valid_0's ndcg@5: 0.684274	valid_0's ndcg@10: 0.715916
[40]	valid_0's ndcg@3: 0.659535	valid_0's ndcg@5: 0.685213	valid_0's ndcg@10: 0.716689
[44]	valid_0's ndcg@3: 0.659659	valid_0's ndcg@5: 0.685705	valid_0's ndcg@10: 0.716864
[48]	valid_0's ndcg@3: 0.65922	valid_0's ndcg@5

In [42]:
# save model
listwise_model_path = Path("../data/hw5/models/listwise_model.dill")
listwise_model_path.parent.mkdir(parents=True, exist_ok=True)
with listwise_model_path.open("wb") as f:
    dill.dump(listwise_model, f)

In [43]:
listwise_model.best_score_["valid_0"]

OrderedDict([('ndcg@3', 0.6680439472549174),
             ('ndcg@5', 0.6924618561993277),
             ('ndcg@10', 0.7227483339229364)])

In [44]:
def add_score_and_rank(
    df: pd.DataFrame,
    y_pred_scores: np.ndarray,
    name: str,
    max_rank: int = 101,
) -> pd.DataFrame:
    """Attach second-level scores and ranks, plus their two-stage variants.

    ``df`` is modified in place (sorted and extended) and also returned.

    Args:
        df: frame with ``user_id`` and ``itemknn_rank`` columns, one row per
            prediction, in the same order as ``y_pred_scores``.
        y_pred_scores: second-level model scores, one per row of ``df``.
        name: prefix for the created columns.
        max_rank: rank that marks "not proposed by the first level"; rows with
            ``itemknn_rank >= max_rank`` are treated as such. Defaults to 101.

    Returns:
        ``df`` sorted by user and descending score, with the columns
        ``{name}_score``, ``{name}_rank``, ``{name}_hybrid_score`` and
        ``{name}_hybrid_rank``. For rows not proposed by the first level the
        hybrid score is ``min(y_pred_scores) - 0.001`` and the hybrid rank is
        ``max_rank``; other rows keep their second-level score and rank.
    """
    score_col = f"{name}_score"
    rank_col = f"{name}_rank"

    # Second-level score, then the rank inside each user's list.
    df[score_col] = y_pred_scores
    df.sort_values(
        by=["user_id", score_col],
        ascending=[True, False],
        inplace=True,
    )
    df[rank_col] = df.groupby("user_id").cumcount() + 1

    # True for items the first-level model did propose (taken after sorting,
    # so it is aligned with the current row order).
    from_first_level = (df["itemknn_rank"] < max_rank).to_numpy()

    eps = 0.001
    scores = np.asarray(y_pred_scores, dtype=float)
    min_score = (scores.min() if scores.size else 0.0) - eps

    # np.where selects by the mask itself. The previous "multiply by the mask,
    # then replace 0" approach also overwrote genuine 0.0 scores of first-level
    # items, and used a chained in-place replace on a column selection.
    df[f"{name}_hybrid_score"] = np.where(
        from_first_level, df[score_col].to_numpy(), min_score
    )
    df[f"{name}_hybrid_rank"] = np.where(
        from_first_level, df[rank_col].to_numpy(), max_rank
    )
    return df

In [45]:
y_pred: np.ndarray = listwise_model.predict(ranker_test[cols])
ranker_test = add_score_and_rank(ranker_test, y_pred, "listwise")
ranker_test.head(3)

Unnamed: 0,user_id,item_id,datetime,total_dur,weight,itemknn_score,itemknn_rank,age,income,sex,...,for_kids,age_rating,studios,item_pop,item_avg_hist,target_ranker,listwise_score,listwise_rank,listwise_hybrid_score,listwise_hybrid_rank
0,1,3669,2021-08-16,1593.0,26.0,-0.01,101.0,1,2,0,...,-1,18.0,-1,2846.0,22.675978,1,1.760832,1,-6.825128,101
1,1,10440,2021-08-13,19579.0,80.0,-0.01,101.0,1,2,0,...,-1,18.0,-1,141889.0,8.068716,2,1.566693,2,-6.825128,101
2,12,846,2021-08-21,5929.0,18.0,-0.01,101.0,-1,-1,-1,...,0,16.0,-1,528.0,22.799242,1,1.272547,1,-6.825128,101


In [46]:
models_metrics["listwise"] = calc_metrics_(ranker_test, "listwise_rank")
models_metrics["listwise_hybrid"] = calc_metrics_(
    ranker_test, "listwise_hybrid_rank"
)
pd.DataFrame(models_metrics)[["listwise", "listwise_hybrid"]]

Unnamed: 0,listwise,listwise_hybrid
Precision@10,0.304603,0.02123
recall@10,0.923517,0.089499
ndcg@10,0.419433,0.021766
map@10,0.884992,0.03872
novelty@10,6.321721,4.045157


# Другие ранкеры

In [47]:
time_df = pd.DataFrame({})

## CatBoostRanker

In [48]:
# Hyperparameters of the CatBoost ranker.
params = {
    "objective": "YetiRank",
    "n_estimators": 10000,  # maximum number of trees
    "max_depth": 4,  # maximum tree depth
    "min_child_samples": 100,  # minimum number of samples in a leaf
    "learning_rate": 0.25,  # learning rate
    "reg_lambda": 1,  # L2 regularization
    "colsample_bylevel": 0.9,  # share of columns sampled (NOTE(review): original note says "per tree" — confirm CatBoost semantics)
    "random_state": 42,
    "custom_metric": ["NDCG"],
}
# Validation pool; rows are grouped by user_id for the groupwise loss.
val_pool = Pool(
    data=ranker_val[cols],
    label=ranker_val["target_ranker"],
    group_id=ranker_val["user_id"],
    cat_features=cat_cols,
)
early_stopping_rounds = 32
fit_params = {
    "X": ranker_train[cols],
    "y": ranker_train["target_ranker"],
    "group_id": ranker_train["user_id"],
    "eval_set": val_pool,
    "early_stopping_rounds": early_stopping_rounds,
    "cat_features": cat_cols,
    "verbose": True,
    "metric_period": 4,
}
# Reuse a previously trained model when one is cached on disk; otherwise train
# and cache it.
# NOTE(review): dill.load executes arbitrary code from the file — only load
# model files produced by this notebook. A cached model is also reused even if
# `params` or the data changed since it was saved.
model_path = Path("../data/hw5/models/vs/CatBoostRanker.dill")
if model_path.exists():
    with model_path.open("rb") as f:
        cat_model = dill.load(f)
else:
    cat_model = CatBoostRanker(**params)
    cat_model.fit(**fit_params)

    model_path.parent.mkdir(parents=True, exist_ok=True)
    with model_path.open("wb") as f:
        dill.dump(cat_model, f)

Groupwise loss function. OneHotMaxSize set to 10




0:	test: 0.6769352	best: 0.6769352 (0)	total: 5.14s	remaining: 14h 17m 29s
4:	test: 0.7303472	best: 0.7303472 (4)	total: 24.8s	remaining: 13h 46m 53s
8:	test: 0.7614457	best: 0.7615734 (7)	total: 44.5s	remaining: 13h 42m 51s
12:	test: 0.7678331	best: 0.7678331 (12)	total: 1m 2s	remaining: 13h 19m 6s
16:	test: 0.7705992	best: 0.7706485 (15)	total: 1m 20s	remaining: 13h 5m 57s
20:	test: 0.7774719	best: 0.7777914 (18)	total: 1m 38s	remaining: 12h 57m 40s
24:	test: 0.7826914	best: 0.7826914 (24)	total: 1m 56s	remaining: 12h 53m 28s
28:	test: 0.7833201	best: 0.7833399 (27)	total: 2m 13s	remaining: 12h 47m 30s
32:	test: 0.7845780	best: 0.7845780 (32)	total: 2m 32s	remaining: 12h 46m 1s
36:	test: 0.7856355	best: 0.7856355 (36)	total: 2m 50s	remaining: 12h 43m 4s
40:	test: 0.7889349	best: 0.7889349 (40)	total: 3m 8s	remaining: 12h 42m 18s
44:	test: 0.7902652	best: 0.7902652 (44)	total: 3m 26s	remaining: 12h 41m 8s
48:	test: 0.7903105	best: 0.7903952 (47)	total: 3m 44s	remaining: 12h 41m 12s
52

In [49]:
start_time = datetime.datetime.now()
y_pred: np.ndarray = cat_model.predict(ranker_test[cols])
predict_time = datetime.datetime.now() - start_time

ranker_test = add_score_and_rank(ranker_test, y_pred, "CatBoostRanker")
ranker_test.head(3)

Unnamed: 0,user_id,item_id,datetime,total_dur,weight,itemknn_score,itemknn_rank,age,income,sex,...,item_avg_hist,target_ranker,listwise_score,listwise_rank,listwise_hybrid_score,listwise_hybrid_rank,CatBoostRanker_score,CatBoostRanker_rank,CatBoostRanker_hybrid_score,CatBoostRanker_hybrid_rank
0,1,3669,2021-08-16,1593.0,26.0,-0.01,101.0,1,2,0,...,22.675978,1,1.760832,1,-6.825128,101,4.520329,1,-1.840654,101
1,1,10440,2021-08-13,19579.0,80.0,-0.01,101.0,1,2,0,...,8.068716,2,1.566693,2,-6.825128,101,3.678916,2,-1.840654,101
2,12,846,2021-08-21,5929.0,18.0,-0.01,101.0,-1,-1,-1,...,22.799242,1,1.272547,1,-6.825128,101,3.760585,1,-1.840654,101


In [50]:
time_df["CatBoostRanker"] = {
    "predict_time": predict_time,
}

In [51]:
models_metrics["CatBoostRanker"] = calc_metrics_(
    ranker_test, "CatBoostRanker_rank"
)
models_metrics["CatBoostRanker_hybrid"] = calc_metrics_(
    ranker_test, "CatBoostRanker_hybrid_rank"
)
pd.DataFrame(models_metrics)[["CatBoostRanker", "CatBoostRanker_hybrid"]]

Unnamed: 0,CatBoostRanker,CatBoostRanker_hybrid
Precision@10,0.303897,0.020725
recall@10,0.921008,0.087207
ndcg@10,0.418073,0.021208
map@10,0.880041,0.037315
novelty@10,6.333758,4.066542


## XGBRanker

In [52]:
def get_XGBRanker_group(df: pd.DataFrame) -> np.ndarray:
    """Return the number of rows per ``user_id`` as a 1-D array.

    The counts come from ``groupby("user_id").size()``, so they are ordered by
    ascending ``user_id``, not by first appearance in ``df``.

    NOTE(review): when used as the ``group`` argument of a ranker, this
    presumably requires ``df`` itself to be sorted by ``user_id`` so that the
    sizes line up with the row order — confirm against the data preparation.
    """
    rows_per_user = df.groupby("user_id").size()
    return rows_per_user.to_numpy()

In [53]:
# Train an XGBRanker (or reuse the cached one) and bind it to `xgb_model`.
params = {
    "objective": "rank:pairwise",
    "n_estimators": 10000,  # upper bound on the number of trees
    "max_depth": 4,  # maximum tree depth
    # "num_leaves" and "min_child_samples" are LightGBM parameter names;
    # XGBoost reported them as "not used", so they are not passed here.
    "learning_rate": 0.25,  # boosting step size
    "reg_lambda": 1,  # L2 regularization
    "colsample_bytree": 0.9,  # fraction of columns sampled for each tree
    "random_state": 42,
}
early_stopping_rounds = 32
fit_params = {
    "X": ranker_train[cols],
    "y": ranker_train["target_ranker"],
    "group": get_XGBRanker_group(ranker_train),
    "eval_set": [(ranker_val[cols], ranker_val["target_ranker"])],
    "eval_group": [get_XGBRanker_group(ranker_val)],
    "eval_metric": "ndcg",
    "early_stopping_rounds": early_stopping_rounds,
    # Log every 4th iteration; integer division keeps the period an int.
    "verbose": early_stopping_rounds // 8,
}
model_path = Path("../data/hw5/models/vs/XGBRanker.dill")
if model_path.exists():
    # dill.load can execute arbitrary code: only load caches written by this
    # notebook. The result must be bound to `xgb_model` (not `cat_model`) so
    # the CatBoost model is not overwritten and the prediction cell below
    # finds the XGBoost model on a cache hit.
    with model_path.open("rb") as f:
        xgb_model = dill.load(f)
else:
    xgb_model = XGBRanker(**params)
    xgb_model.fit(**fit_params)

    # Cache the fitted model so later runs skip training.
    model_path.parent.mkdir(parents=True, exist_ok=True)
    with model_path.open("wb") as f:
        dill.dump(xgb_model, f)

Parameters: { "min_child_samples", "num_leaves" } are not used.

[0]	validation_0-ndcg:0.76450
[4]	validation_0-ndcg:0.76069
[8]	validation_0-ndcg:0.76615
[12]	validation_0-ndcg:0.76747
[16]	validation_0-ndcg:0.76797
[20]	validation_0-ndcg:0.76850
[24]	validation_0-ndcg:0.77014
[28]	validation_0-ndcg:0.77156
[32]	validation_0-ndcg:0.77247
[36]	validation_0-ndcg:0.77400
[40]	validation_0-ndcg:0.77600
[44]	validation_0-ndcg:0.77661
[48]	validation_0-ndcg:0.77643
[52]	validation_0-ndcg:0.77734
[56]	validation_0-ndcg:0.77748
[60]	validation_0-ndcg:0.77740
[64]	validation_0-ndcg:0.77782
[68]	validation_0-ndcg:0.77824
[72]	validation_0-ndcg:0.77844
[76]	validation_0-ndcg:0.77847
[80]	validation_0-ndcg:0.77854
[84]	validation_0-ndcg:0.77858
[88]	validation_0-ndcg:0.77893
[92]	validation_0-ndcg:0.77940
[96]	validation_0-ndcg:0.77926
[100]	validation_0-ndcg:0.77906
[104]	validation_0-ndcg:0.77928
[108]	validation_0-ndcg:0.77908
[112]	validation_0-ndcg:0.77899
[116]	validation_0-ndcg:0.77966
[12

In [54]:
# Time XGBRanker inference on the test candidates. Only the predict() call
# (including the ranker_test[cols] column selection) sits inside the timed span.
start_time = datetime.datetime.now()
y_pred: np.ndarray = xgb_model.predict(ranker_test[cols])
predict_time = datetime.datetime.now() - start_time  # a datetime.timedelta

# Attach the predictions under the "XGBRanker" prefix; the frame shown below
# carries XGBRanker_score / _rank and XGBRanker_hybrid_score / _hybrid_rank
# columns afterwards.
# NOTE(review): ranker_test is reassigned from itself, so re-running this cell
# feeds the already-augmented frame back into add_score_and_rank — confirm the
# helper tolerates that.
ranker_test = add_score_and_rank(ranker_test, y_pred, "XGBRanker")
ranker_test.head(3)

Unnamed: 0,user_id,item_id,datetime,total_dur,weight,itemknn_score,itemknn_rank,age,income,sex,...,listwise_hybrid_score,listwise_hybrid_rank,CatBoostRanker_score,CatBoostRanker_rank,CatBoostRanker_hybrid_score,CatBoostRanker_hybrid_rank,XGBRanker_score,XGBRanker_rank,XGBRanker_hybrid_score,XGBRanker_hybrid_rank
0,1,3669,2021-08-16,1593.0,26.0,-0.01,101.0,1,2,0,...,-6.825128,101,4.520329,1,-1.840654,101,3.797146,1,-4.297544,101
1,1,10440,2021-08-13,19579.0,80.0,-0.01,101.0,1,2,0,...,-6.825128,101,3.678916,2,-1.840654,101,3.183236,2,-4.297544,101
2,12,846,2021-08-21,5929.0,18.0,-0.01,101.0,-1,-1,-1,...,-6.825128,101,3.760585,1,-1.840654,101,3.21163,1,-4.297544,101


In [55]:
# Store the XGBRanker inference time measured in the previous cell.
time_df["XGBRanker"] = {"predict_time": predict_time}

In [56]:
# Evaluate both rankings produced for XGBRanker: the plain model rank and the
# hybrid rank, each stored under its own key in models_metrics.
for metrics_key, rank_column in (
    ("XGBRanker", "XGBRanker_rank"),
    ("XGBRanker_hybrid", "XGBRanker_hybrid_rank"),
):
    models_metrics[metrics_key] = calc_metrics_(ranker_test, rank_column)

pd.DataFrame(models_metrics)[["XGBRanker", "XGBRanker_hybrid"]]

Unnamed: 0,XGBRanker,XGBRanker_hybrid
Precision@10,0.304351,0.021147
recall@10,0.922649,0.088812
ndcg@10,0.41868,0.021645
map@10,0.88204,0.038121
novelty@10,6.341096,4.071015


## LGBMRanker

In [57]:
# Train an LGBMRanker (or reuse the cached one) and bind it to `lgbm_model`.
params = {
    "objective": "lambdarank",  # LambdaRank objective, which optimizes NDCG
    "n_estimators": 10000,  # upper bound on the number of trees
    "max_depth": 4,  # maximum tree depth
    "num_leaves": 10,  # number of leaves, kept below 2^max_depth
    "min_child_samples": 100,  # minimum number of samples in a leaf
    "learning_rate": 0.25,  # boosting step size
    "reg_lambda": 1,  # L2 regularization
    "colsample_bytree": 0.9,  # fraction of columns sampled for each tree
    "random_state": 42,
}
early_stopping_rounds = 32
fit_params = {
    "X": ranker_train[cols],
    "y": ranker_train["target_ranker"],
    "group": get_group(ranker_train),
    "eval_set": [(ranker_val[cols], ranker_val["target_ranker"])],
    "eval_group": [get_group(ranker_val)],
    "eval_metric": "ndcg",
    "eval_at": (3, 5, 10),
    "early_stopping_rounds": early_stopping_rounds,
    "categorical_feature": cat_cols,
    "feature_name": cols,
    # Log every 4th iteration; integer division keeps the period an int.
    "verbose": early_stopping_rounds // 8,
}
model_path = Path("../data/hw5/models/vs/LGBMRanker.dill")
if model_path.exists():
    # dill.load can execute arbitrary code: only load caches written by this
    # notebook. The result must be bound to `lgbm_model` (not `cat_model`) so
    # the CatBoost model is not overwritten and the prediction cell below
    # finds the LightGBM model on a cache hit.
    with model_path.open("rb") as f:
        lgbm_model = dill.load(f)
else:
    lgbm_model = LGBMRanker(**params)
    lgbm_model.fit(**fit_params)

    # Cache the fitted model so later runs skip training.
    model_path.parent.mkdir(parents=True, exist_ok=True)
    with model_path.open("wb") as f:
        dill.dump(lgbm_model, f)

[4]	valid_0's ndcg@3: 0.643962	valid_0's ndcg@5: 0.671561	valid_0's ndcg@10: 0.705344
[8]	valid_0's ndcg@3: 0.648764	valid_0's ndcg@5: 0.675321	valid_0's ndcg@10: 0.707564
[12]	valid_0's ndcg@3: 0.653477	valid_0's ndcg@5: 0.679437	valid_0's ndcg@10: 0.711789
[16]	valid_0's ndcg@3: 0.653453	valid_0's ndcg@5: 0.680424	valid_0's ndcg@10: 0.712568
[20]	valid_0's ndcg@3: 0.654044	valid_0's ndcg@5: 0.681304	valid_0's ndcg@10: 0.712904
[24]	valid_0's ndcg@3: 0.656378	valid_0's ndcg@5: 0.682488	valid_0's ndcg@10: 0.714243
[28]	valid_0's ndcg@3: 0.656949	valid_0's ndcg@5: 0.683125	valid_0's ndcg@10: 0.71468
[32]	valid_0's ndcg@3: 0.657173	valid_0's ndcg@5: 0.683054	valid_0's ndcg@10: 0.71482
[36]	valid_0's ndcg@3: 0.658554	valid_0's ndcg@5: 0.684274	valid_0's ndcg@10: 0.715916
[40]	valid_0's ndcg@3: 0.659535	valid_0's ndcg@5: 0.685213	valid_0's ndcg@10: 0.716689
[44]	valid_0's ndcg@3: 0.659659	valid_0's ndcg@5: 0.685705	valid_0's ndcg@10: 0.716864
[48]	valid_0's ndcg@3: 0.65922	valid_0's ndcg@5

In [58]:
# Time LGBMRanker inference on the test candidates. Only the predict() call
# (including the ranker_test[cols] column selection) sits inside the timed span.
start_time = datetime.datetime.now()
y_pred: np.ndarray = lgbm_model.predict(ranker_test[cols])
predict_time = datetime.datetime.now() - start_time  # a datetime.timedelta

# Attach the predictions under the "LGBMRanker" prefix; the frame shown below
# carries LGBMRanker_score / _rank and LGBMRanker_hybrid_score / _hybrid_rank
# columns afterwards.
# NOTE(review): ranker_test is reassigned from itself, so re-running this cell
# feeds the already-augmented frame back into add_score_and_rank — confirm the
# helper tolerates that.
ranker_test = add_score_and_rank(ranker_test, y_pred, "LGBMRanker")
ranker_test.head(3)

Unnamed: 0,user_id,item_id,datetime,total_dur,weight,itemknn_score,itemknn_rank,age,income,sex,...,CatBoostRanker_hybrid_score,CatBoostRanker_hybrid_rank,XGBRanker_score,XGBRanker_rank,XGBRanker_hybrid_score,XGBRanker_hybrid_rank,LGBMRanker_score,LGBMRanker_rank,LGBMRanker_hybrid_score,LGBMRanker_hybrid_rank
0,1,3669,2021-08-16,1593.0,26.0,-0.01,101.0,1,2,0,...,-1.840654,101,3.797146,1,-4.297544,101,1.760832,1,-6.825128,101
1,1,10440,2021-08-13,19579.0,80.0,-0.01,101.0,1,2,0,...,-1.840654,101,3.183236,2,-4.297544,101,1.566693,2,-6.825128,101
2,12,846,2021-08-21,5929.0,18.0,-0.01,101.0,-1,-1,-1,...,-1.840654,101,3.21163,1,-4.297544,101,1.272547,1,-6.825128,101


In [59]:
# Store the LGBMRanker inference time measured in the previous cell.
time_df["LGBMRanker"] = {"predict_time": predict_time}

In [60]:
# Evaluate both rankings produced for LGBMRanker: the plain model rank and the
# hybrid rank, each stored under its own key in models_metrics.
for metrics_key, rank_column in (
    ("LGBMRanker", "LGBMRanker_rank"),
    ("LGBMRanker_hybrid", "LGBMRanker_hybrid_rank"),
):
    models_metrics[metrics_key] = calc_metrics_(ranker_test, rank_column)

pd.DataFrame(models_metrics)[["LGBMRanker", "LGBMRanker_hybrid"]]

Unnamed: 0,LGBMRanker,LGBMRanker_hybrid
Precision@10,0.304603,0.02123
recall@10,0.923517,0.089499
ndcg@10,0.419433,0.021766
map@10,0.884992,0.03872
novelty@10,6.321717,4.045156


## Итоги

In [61]:
# Side-by-side metrics of the three rankers (plain model ranks only).
pd.DataFrame(models_metrics).loc[:, ["LGBMRanker", "XGBRanker", "CatBoostRanker"]]

Unnamed: 0,LGBMRanker,XGBRanker,CatBoostRanker
Precision@10,0.304603,0.304351,0.303897
recall@10,0.923517,0.922649,0.921008
ndcg@10,0.419433,0.41868,0.418073
map@10,0.884992,0.88204,0.880041
novelty@10,6.321717,6.341096,6.333758


In [62]:
# NDCG@10 row for the three rankers: pick the columns first, then the metric.
(
    pd.DataFrame(models_metrics)
    .loc[:, ["LGBMRanker", "XGBRanker", "CatBoostRanker"]]
    .loc["ndcg@10"]
)

LGBMRanker        0.419433
XGBRanker         0.418680
CatBoostRanker    0.418073
Name: ndcg@10, dtype: float64

In [63]:
# Inference time per ranker, as recorded under "predict_time" by the cells above.
time_df

Unnamed: 0,CatBoostRanker,XGBRanker,LGBMRanker
predict_time,0 days 00:00:35.324463,0 days 00:00:03.640847,0 days 00:01:02.352432
