In [None]:
!pip install shap
!pip install lightfm
!pip install rectools
!pip install catboost



In [None]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
import requests
import shap
import zipfile as zf
import pickle

from typing import Callable, Dict, Set, List, Optional, Any, Tuple
from scipy.sparse import csr_matrix
from lightfm import LightFM
from rectools.dataset import Dataset
from lightgbm import LGBMRanker, LGBMClassifier
from rectools.metrics import calc_metrics, NDCG, MAP, Precision, Recall, MeanInvUserFreq
from rectools import Columns
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
from catboost import CatBoostRanker, Pool


In [None]:
# Mount Google Drive at /content/drive so its files are reachable through the
# local filesystem. The `google.colab` package requires a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:

# Feature columns selected for the ranker, in the order they are passed to the model.
# The three groups mirror the original line layout; judging by the names they are
# first-stage model outputs, user features and item features — TODO confirm.
_first_stage_cols = ['score_lightfm', 'rank_lightfm', 'score_pop', 'rank_pop']
_user_cols = ['age', 'income', 'sex', 'kids_flg', 'user_hist', 'user_avg_pop', 'user_last_pop']
_item_cols = [
    'content_type', 'release_year', 'for_kids', 'age_rating',
    'studios', 'item_pop', 'item_avg_hist',
]
cols = [*_first_stage_cols, *_user_cols, *_item_cols]

# Subset of `cols` that the name marks as categorical (user part + item part).
cat_cols = ['age', 'income', 'sex', 'kids_flg'] + ['content_type', 'for_kids', 'studios']

In [None]:
# Root folder holding the pre-built ranker datasets.
# NOTE(review): Colab usually exposes Drive contents under /content/drive/MyDrive/ —
# confirm that the CSVs really sit directly under the mount point.
DATA_DIR = "/content/drive"

# Bind every split to an explicit name instead of writing through `locals()`:
# assigning into the `locals()` mapping only takes effect at module level (where it
# is the global namespace) and hides the variables from readers and static tools.
ranker_train, ranker_val, ranker_test = (
    pd.read_csv(f"{DATA_DIR}/ranker_{split}.csv")
    for split in ("train", "val", "test")
)

In [3]:
def add_target(df: pd.DataFrame) -> pd.DataFrame:
    """Attach a three-level ``target_ranker`` column derived from the weight column.

    The grade counts how many of the thresholds 15 and 75 the value in
    ``df[Columns.Weight]`` reaches: 0 below 15, 1 from 15 up to (but not
    including) 75, and 2 from 75 upwards.

    Args:
        df: Frame containing the ``Columns.Weight`` column.

    Returns:
        The same ``df`` object; the new column is written into it in place.
    """
    weight = df[Columns.Weight]
    # Each comparison contributes 0 or 1, so their sum is the grade.
    df['target_ranker'] = (weight >= 15).astype(int) + (weight >= 75).astype(int)
    return df

# Label every split. add_target returns the frame it was given, so each name is
# rebound to its own (now labelled) frame.
ranker_train, ranker_val, ranker_test = map(
    add_target, (ranker_train, ranker_val, ranker_test)
)

In [None]:
# Feature matrices and graded targets for the train / validation splits.
X_train = ranker_train[cols]
y_train = ranker_train['target_ranker']

X_val = ranker_val[cols]
y_val = ranker_val['target_ranker']

# One ranking group per user.
# NOTE(review): CatBoost expects the rows of a group to be contiguous; this assumes
# the CSVs are already ordered by user_id — confirm.
# NOTE(review): `cat_cols` is defined earlier but is not passed as `cat_features`,
# so no column is handled as categorical — confirm this is intended.
train_pool = Pool(X_train, label=y_train, group_id=ranker_train['user_id'])
val_pool = Pool(X_val, label=y_val, group_id=ranker_val['user_id'])

params = {
    # The training log recorded for this cell shows 10 boosting rounds
    # (iterations 0-9, bestIteration = 9); `iterations: 1` did not match it
    # and would train a single tree on Restart & Run All.
    'iterations': 10,
    'learning_rate': 0.12,
    'depth': 8,
    'loss_function': 'YetiRank',
    'custom_metric': ['NDCG:top=10'],
    'random_seed': 42,
}

catboost_ranker_model = CatBoostRanker(**params)
# use_best_model keeps the iteration that scored best on the validation pool.
catboost_ranker_model.fit(train_pool, eval_set=val_pool, use_best_model=True, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	test: 0.7963706	best: 0.7963706 (0)	total: 9.68s	remaining: 1m 27s
1:	test: 0.7976028	best: 0.7976028 (1)	total: 20.8s	remaining: 1m 23s
2:	test: 0.8000425	best: 0.8000425 (2)	total: 32.2s	remaining: 1m 15s
3:	test: 0.8001674	best: 0.8001674 (3)	total: 41.1s	remaining: 1m 1s
4:	test: 0.8003456	best: 0.8003456 (4)	total: 53.2s	remaining: 53.2s
5:	test: 0.8011107	best: 0.8011107 (5)	total: 1m 4s	remaining: 43.1s
6:	test: 0.8011167	best: 0.8011167 (6)	total: 1m 13s	remaining: 31.6s
7:	test: 0.8015994	best: 0.8015994 (7)	total: 1m 24s	remaining: 21.2s
8:	test: 0.8144637	best: 0.8144637 (8)	total: 1m 37s	remaining: 10.8s
9:	test: 0.8144715	best: 0.8144715 (9)	total: 1m 49s	remaining: 0us

bestTest = 0.8144714803
bestIteration = 9



<catboost.core.CatBoostRanker at 0x7abf18126ec0>

In [None]:
# Stack all three splits into one frame. Each split was read with its own default
# 0..n-1 index, so renumber the rows here to avoid duplicated index labels.
ranker = pd.concat([ranker_train, ranker_val, ranker_test], ignore_index=True)

In [None]:
# Order rows by user so that every user's candidates sit next to each other
# before they are handed to the Pool as groups.
ranker_sorted = ranker.sort_values('user_id')

ranker_pool = Pool(data=ranker_sorted[cols], group_id=ranker_sorted['user_id'])

# Scores are computed on ranker_sorted, so they line up with its rows
# (presumably in the same order — confirm), not with `ranker`.
preds = catboost_ranker_model.predict(ranker_pool)


In [None]:
# Show the predicted scores (the cell's last expression is rendered as its output).
preds