In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import requests
from tqdm.auto import tqdm
from scipy.stats import mode 
from pprint import pprint
import warnings
warnings.filterwarnings("ignore")

from rectools import Columns
from rectools.dataset import Interactions, Dataset
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics, MAP
from rectools.model_selection import TimeRangeSplitter
from rectools.models import ImplicitALSWrapperModel, PopularModel, PureSVDModel, LightFMWrapperModel

from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender, BM25Recommender
from implicit.als import AlternatingLeastSquares
from implicit.bpr import BayesianPersonalizedRanking
from implicit.lmf import LogisticMatrixFactorization

from lightfm import LightFM

import dill

import optuna

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)

# 🎬 Get KION dataset 

<a href="https://ods.ai/competitions/competition-recsys-21/data"> Dataset description [ru] </a>

In [2]:
# # download dataset by chunks
# url = "https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip"

# req = requests.get(url, stream=True)

# with open('../data/kion_train.zip', "wb") as fd:
#     total_size_in_bytes = int(req.headers.get('Content-Length', 0))
#     progress_bar = tqdm(desc='kion dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True)
#     for chunk in req.iter_content(chunk_size=2 ** 20):
#         progress_bar.update(len(chunk))
#         fd.write(chunk)

In [3]:
# !cd ../data && unzip kion_train.zip && cd -

# EDA

In [4]:
interactions = pd.read_csv('../data/kion_train/interactions.csv')
users = pd.read_csv('../data/kion_train/users.csv')
items = pd.read_csv('../data/kion_train/items.csv')

## interactions

In [5]:
# rename columns, convert timestamp
interactions.rename(columns={'last_watch_dt': Columns.Datetime,
                            'total_dur': Columns.Weight}, 
                    inplace=True) 
interactions.drop(interactions[interactions[Columns.Datetime].str.len() != 10].index, inplace=True)
interactions['datetime'] = pd.to_datetime(interactions['datetime'])

In [6]:
pd.concat([interactions.head(), interactions.tail()])

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0
5476246,648596,12225,2021-08-13,76,0.0
5476247,546862,9673,2021-04-13,2308,49.0
5476248,697262,15297,2021-08-20,18307,63.0
5476249,384202,16197,2021-04-19,6203,100.0
5476250,319709,4436,2021-08-15,3921,45.0


In [7]:
print(f"Interactions dataframe shape: {interactions.shape}")
print(f"Unique users in interactions: {interactions['user_id'].nunique():_}")
print(f"Unique items in interactions: {interactions['item_id'].nunique():_}")

Interactions dataframe shape: (5476251, 5)
Unique users in interactions: 962_179
Unique items in interactions: 15_706


In [8]:
max_date = interactions['datetime'].max()
min_date = interactions['datetime'].min()

print(f"min date in interactions: {min_date}")
print(f"max date in interactions: {max_date}")

min date in interactions: 2021-03-13 00:00:00
max date in interactions: 2021-08-22 00:00:00


In [9]:
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5476251 entries, 0 to 5476250
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   user_id      int64         
 1   item_id      int64         
 2   datetime     datetime64[ns]
 3   weight       int64         
 4   watched_pct  float64       
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 208.9 MB


In [10]:
rec_interactions = Interactions(interactions)

users.fillna('Unknown', inplace=True)
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()
## users

In [11]:
users.fillna('Unknown', inplace=True)

In [12]:
pd.concat([users.head(), users.tail()])

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
2,1047345,age_45_54,income_40_60,Ж,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0
840192,339025,age_65_inf,income_0_20,Ж,0
840193,983617,age_18_24,income_20_40,Ж,1
840194,251008,Unknown,Unknown,Unknown,0
840195,590706,Unknown,Unknown,Ж,0
840196,166555,age_65_inf,income_20_40,Ж,0


In [13]:
user_features_frames = []
for feature in ["sex", "age", "income"]:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)
user_features

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
2,1047345,Ж,sex
3,721985,Ж,sex
4,704055,Ж,sex
...,...,...,...
840192,339025,income_0_20,income
840193,983617,income_20_40,income
840194,251008,Unknown,income
840195,590706,Unknown,income


In [14]:
print(f"Users dataframe shape {users.shape}")
print(f"Unique users: {users['user_id'].nunique():_}")

Users dataframe shape (840197, 5)
Unique users: 840_197


## items

In [15]:
pd.concat([items.head(3), items.tail(3)])

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Грандинетти, Джеральдин Чаплин, Елена Анайя, Каэтано Велозо, Леонор Уотлинг, Лола Дуэньяс, Лолес Леон, Малу Айродо, Мариола Фуэнтес, Пас Вега, Пина Бауш, Ро...",Мелодрама легендарного Педро Альмодовара «Поговори с ней» в 2003 году получила премию «Оскар» за лучший сценарий. Журналист Марко берет интервью у знаменитой женщины-тореро Лидии и вскоре влюбляе...,"Поговори, ней, 2002, Испания, друзья, любовь, сильные, женщины, преодоление, трудностей, отношения, дружба, отношения, паре, отношения, мужчины, женщины, романтические, отношения, потеря, близких,..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон Манцукас, Джон Глейсер, Карл Грин, Кристен Риттер, Лэнс Реддик, Морис Комт, Патрик Кернс, Ребекка Коллинз, Роза Салазар, Росс П. Кук, Стеффи Гроут, Ти...","Уморительная современная комедия на популярную тему о том, как не надо отмечать мальчишник. Главный герой усвоил, что не надо звать на свадьбу своего друга Джейсона, из-за которого он вместо сваде...","Голые, перцы, 2014, США, друзья, свадьбы, преодоление, трудностей, расставания, отношения, дружба, риск, недоразумение, мужская, дружба, мальчишники, девичники"
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман, Дэн Риззуто, Кендес Илэйн Калтраро, Кит Джардин, Лекса Дойг, Майкл Джей Уайт, Майкл Шэнкс, Майкл Эклунд, Питер Брайант, Питер Кент, Стив Бачич, Стив ...","Профессиональный рестлер Стив Остин («Все или ничего») и темнокожий мачо Майкл Джей Уайт («Темный рыцарь») в интригующем криминальном боевике. В центре сюжета – команда спецназовцев, которая оказа...","Тактическая, сила, 2011, Канада, бандиты, гангстеры, преступления, преодоление, трудностей, убийства, убийцы, настоящие, мужчины, риск, недоразумение, силы, правопорядка, борьба, за, выживание, сп..."
15960,10632,series,Сговор,Hassel,2017.0,"драмы, триллеры, криминал",Россия,0.0,18.0,,"Эшреф Рейбрук, Амир Камдин, Эрик Эгер","Ола Рапас, Алиетт Офейм, Уильма Лиден, Шанти Рони, Тома Холмин","Криминальная драма по мотивам романов о шведском детективе Роланде Хасселе. Средь бела дня убит полицейский, и нити в этом деле ведут прямо в коридоры власти. Расследованием занимается детектив Ха...","Сговор, 2017, Россия"
15961,4538,series,Среди камней,Darklands,2019.0,"драмы, спорт, криминал",Россия,0.0,18.0,,"Марк О’Коннор, Конор МакМахон","Дэйн Уайт О’Хара, Томас Кэйн-Бирн, Джудит Родди, Марк О’Халлоран, Джимми Смоллхорн","Семнадцатилетний Дэмиен мечтает вырваться за пределы своего района и стать профессиональным бойцом. Когда его кумир и старший брат исчезает, парень попадает в чуждый ему мир насилия, наркотиков и ...","Среди, камней, 2019, Россия"
15962,3206,series,Гоша,,2019.0,комедии,Россия,0.0,16.0,,Михаил Миронов,"Мкртыч Арзуманян, Виктория Рунцова","Добродушный Гоша не может выйти из дома, чтобы не попасть в нелепую и курьёзную историю. Но даже неудачники мечтают о любви, и наш герой — не исключение, ведь оптимизма ему не занимать.","Гоша, 2019, Россия"


In [16]:
print(f"Items dataframe shape {items.shape}")
print(f"Unique item_id: {items['item_id'].nunique():_}")

Items dataframe shape (15963, 14)
Unique item_id: 15_963


## Features

### Genre

In [17]:
# Explode genres to flatten table
items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"
genre_feature.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


### Content

In [18]:
content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"
content_feature

Unnamed: 0,id,value,feature
0,10711,film,content_type
1,2508,film,content_type
2,10716,film,content_type
3,7868,film,content_type
4,16268,film,content_type
...,...,...,...
15958,6443,series,content_type
15959,2367,series,content_type
15960,10632,series,content_type
15961,4538,series,content_type


### Year

In [19]:
year_feature = items.reindex(columns=[Columns.Item, "release_year"])
year_feature.columns = ["id", "value"]
year_feature["feature"] = "release_year"
year_feature = year_feature.dropna()
year_feature

Unnamed: 0,id,value,feature
0,10711,2002.0,release_year
1,2508,2014.0,release_year
2,10716,2011.0,release_year
3,7868,2015.0,release_year
4,16268,1978.0,release_year
...,...,...,...
15958,6443,2018.0,release_year
15959,2367,2020.0,release_year
15960,10632,2017.0,release_year
15961,4538,2019.0,release_year


### Countries

In [20]:
items["countries"] = items["countries"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
countrie_feature = items[["item_id", "countries"]].explode("countries")
countrie_feature.columns = ["id", "value"]
countrie_feature["feature"] = "countrie"
countrie_feature = countrie_feature.dropna()

### Final features

In [21]:
item_features = pd.concat((genre_feature, content_feature, countrie_feature, year_feature))
item_features

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre
...,...,...,...
15958,6443,2018.0,release_year
15959,2367,2020.0,release_year
15960,10632,2017.0,release_year
15961,4538,2019.0,release_year


# Metrics

In [22]:
metrics = {
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "map@10": MAP(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
}

In [23]:
item_features.isna()

Unnamed: 0,id,value,feature
0,False,False,False
0,False,False,False
0,False,False,False
0,False,False,False
1,False,False,False
...,...,...,...
15958,False,False,False
15959,False,False,False
15960,False,False,False
15961,False,False,False


# Models

In [24]:
K_RECOS = 10
N_EPOCHS = 2 # Lightfm
USER_ALPHA = 0 # Lightfm
ITEM_ALPHA = 0 # Lightfm
NUM_THREADS = 8
RANDOM_STATE = 152

In [25]:
models = {
    "pure_svd": PureSVDModel,
    "als": AlternatingLeastSquares,
    "lightfm": LightFM,
}

# Split Data

In [26]:
# setting for cv 
n_folds = 7
unit = "W"
n_units = 1

last_date = interactions[Columns.Datetime].max().normalize()
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)  
print(f"Start date and last date of the test fold: {start_date, last_date}")

Start date and last date of the test fold: (Timestamp('2021-06-27 00:00:00'), Timestamp('2021-08-22 00:00:00'))


In [27]:
periods = n_folds + 1
freq = f"{n_units}{unit}"
print(
    f"start_date: {start_date}\n"
    f"last_date: {last_date}\n"
    f"periods: {periods}\n"
    f"freq: {freq}\n"
)
    
date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# generator of folds
cv = TimeRangeSplitter(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv.get_n_splits(rec_interactions)}")

start_date: 2021-06-27 00:00:00
last_date: 2021-08-22 00:00:00
periods: 8
freq: 1W

Test fold borders: ['2021-06-27' '2021-07-04' '2021-07-11' '2021-07-18' '2021-07-25'
 '2021-08-01' '2021-08-08' '2021-08-15']
Real number of folds: 7


In [28]:
fold_iterator = cv.split(rec_interactions, collect_fold_stats=True)
train_ids, test_ids, fold_info = next(iter(fold_iterator))

df_train = interactions.iloc[train_ids].copy()
df_test = interactions.iloc[test_ids][Columns.UserItem].copy()

In [29]:
TEST_USERS = df_test[Columns.User].unique()
catalog = df_train[Columns.Item].unique()

item_features = item_features.loc[item_features["id"].isin(df_train[Columns.Item])].copy()
user_features = user_features.loc[user_features["id"].isin(df_train[Columns.User])].copy()

train_dataset  = Dataset.construct(
    interactions_df=df_train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type", "countrie"],
)

In [112]:
with open('../data/hw4_dataset.dill', 'wb') as f:
    dill.dump(train_dataset, f)

# Model training by fold

In [30]:
N_TRIALS = 20
results = [None] * N_TRIALS

In [31]:
def optimizer(trial):
    regressor_name = trial.suggest_categorical('regressor', models)
    if regressor_name == "pure_svd":
        factors = trial.suggest_int('factors', 4, 10)
        model = models[regressor_name](factors=factors)
    elif regressor_name == "lightfm":
        factors = trial.suggest_int('factors', 4, 10)
        loss = trial.suggest_categorical('loss', ['logistic', 'bpr', 'warp'])
        learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
        model = LightFMWrapperModel(
            models[regressor_name](
                no_components=factors, 
                loss=loss, 
                random_state=RANDOM_STATE,
                learning_rate=learning_rate,
                user_alpha=USER_ALPHA,
                item_alpha=ITEM_ALPHA,
            ),
            epochs=N_EPOCHS,
            num_threads=NUM_THREADS,
        )
    else:
        factors = trial.suggest_int('factors', 4, 10)
        regularization = trial.suggest_float('regularization', 1e-5, 1e-2, log=True)
        model = ImplicitALSWrapperModel(model=models[regressor_name](factors=factors, regularization=regularization, num_threads=NUM_THREADS, random_state=RANDOM_STATE))
    model.fit(train_dataset)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=train_dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    metric_values = calc_metrics(
        metrics,
        reco=recos,
        interactions=df_test,
        prev_interactions=df_train,
        catalog=catalog,
    )
    model_quality = {'model': regressor_name}
    model_quality.update(metric_values)
    results[trial.number] = model_quality
    return metric_values["map@10"]

In [32]:
study = optuna.create_study(direction='maximize')  # Create a new study.
study.optimize(optimizer, n_trials=N_TRIALS) 

[32m[I 2022-12-11 17:04:38,396][0m A new study created in memory with name: no-name-e8c0db1c-a0fe-48b9-9e23-b86a7927c2e6[0m
[32m[I 2022-12-11 17:05:47,992][0m Trial 0 finished with value: 0.00012261387814842676 and parameters: {'regressor': 'lightfm', 'factors': 5, 'loss': 'logistic', 'learning_rate': 0.08357284781726208}. Best is trial 0 with value: 0.00012261387814842676.[0m
[32m[I 2022-12-11 17:06:59,161][0m Trial 1 finished with value: 1.232626938934429e-05 and parameters: {'regressor': 'lightfm', 'factors': 10, 'loss': 'bpr', 'learning_rate': 3.996531197615023e-05}. Best is trial 0 with value: 0.00012261387814842676.[0m
[32m[I 2022-12-11 17:07:57,468][0m Trial 2 finished with value: 0.11738699206548243 and parameters: {'regressor': 'pure_svd', 'factors': 6}. Best is trial 2 with value: 0.11738699206548243.[0m
[32m[I 2022-12-11 17:08:55,676][0m Trial 3 finished with value: 0.11738302316355538 and parameters: {'regressor': 'pure_svd', 'factors': 6}. Best is trial 2 wit

# 👌 Metrics 

`Metrics by fold`



In [40]:
df_metrics = pd.DataFrame(results)
df_metrics.sort_values("map@10", ascending=False)

Unnamed: 0,model,prec@10,recall@10,map@10,novelty,serendipity
12,pure_svd,0.059872,0.362655,0.11852,5.490398,1.312511e-05
10,pure_svd,0.059872,0.362662,0.118516,5.490122,1.312497e-05
11,pure_svd,0.05987,0.362647,0.118515,5.49035,1.312102e-05
17,pure_svd,0.05987,0.362645,0.118514,5.490247,1.312102e-05
7,pure_svd,0.05987,0.362645,0.118512,5.49011,1.312095e-05
13,pure_svd,0.059616,0.361201,0.118487,5.498497,1.303961e-05
14,pure_svd,0.059617,0.361206,0.118484,5.498598,1.303954e-05
2,pure_svd,0.059989,0.363688,0.117387,5.480705,1.272558e-05
3,pure_svd,0.059986,0.363662,0.117383,5.480828,1.272142e-05
16,pure_svd,0.059986,0.363662,0.117382,5.480702,1.272134e-05


In [34]:
df_metrics.to_pickle("../data/df_metrics.pickle")

## Metrics mean by fold
`we can compare two models`

In [35]:
df_metrics.groupby('model').mean()[metrics.keys()]

Unnamed: 0_level_0,prec@10,recall@10,map@10,novelty,serendipity
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
als,0.023266,0.138335,0.047115,7.460229,6.1e-05
lightfm,0.000238,0.001075,0.000229,15.95361,3e-06
pure_svd,0.059487,0.360633,0.117425,5.506075,1.3e-05


## Metrics std by fold

`If a diff between model metrics less than an std value => there is no significant difference observed`

- For instance, for the serendipity metric there is no such difference between cosine_itemknn and tfidf_itemknn model results

In [36]:
df_metrics.groupby('model').std()[metrics.keys()]

Unnamed: 0_level_0,prec@10,recall@10,map@10,novelty,serendipity
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
als,0.002523,0.02101,0.011255,0.052621,3.162547e-06
lightfm,0.000286,0.001399,0.000271,1.204256,3.407256e-06
pure_svd,0.000868,0.004821,0.001808,0.040197,1.674661e-07


# Fit best model

In [46]:
model = PureSVDModel(factors=4)
model.fit(train_dataset)

<rectools.models.pure_svd.PureSVDModel at 0x7f18c90d5ac0>

In [47]:
with open('../data/hw4_model.dill', 'wb') as f:
    dill.dump(model, f)

# Approximate Nearest Neighbors

In [50]:
user_embeddings, item_embeddings = model.get_vectors()

In [56]:
from annoy import AnnoyIndex

In [59]:
f = user_embeddings.shape[1]  # Length of item vector that will be indexed

t = AnnoyIndex(f, 'euclidean')
for i, emb in enumerate(item_embeddings):
    t.add_item(i, emb)

In [60]:
t.build(20)

True

In [78]:
user_id = 155
internal_user_id = dataset.user_id_map.to_internal[user_id]
dataset.item_id_map.convert_to_external(t.get_nns_by_vector(user_embeddings[internal_user_id], 10)).tolist()

[11418, 10996, 2724, 770, 2891, 3283, 11082, 9083, 11490, 11915]

# Find popular for cold user
Ищем наиболее встречающиеся рекомендации из 100

In [109]:
from collections import Counter

In [111]:
counter = Counter()
for user in tqdm(user_embeddings):
    items = t.get_nns_by_vector(user_embeddings[internal_user_id], 100)
    counter.update(Counter(items))

  0%|          | 0/536802 [00:00<?, ?it/s]

In [118]:
popular_items = dataset.item_id_map.convert_to_external([item[0] for item in counter.most_common(100)]).tolist()
popular_items

[11418,
 10996,
 2724,
 770,
 2891,
 3283,
 11082,
 9083,
 11490,
 11915,
 15571,
 4647,
 2825,
 1636,
 12049,
 13472,
 11616,
 6840,
 857,
 6624,
 7295,
 5259,
 4730,
 7462,
 174,
 11285,
 6595,
 11969,
 13786,
 6942,
 5583,
 9302,
 13403,
 14766,
 6851,
 1605,
 8037,
 4586,
 5072,
 9675,
 15882,
 2871,
 14647,
 5137,
 8475,
 1615,
 15910,
 3672,
 15179,
 3236,
 13716,
 12712,
 9149,
 1788,
 15889,
 13975,
 11978,
 11907,
 15492,
 321,
 2565,
 8716,
 2690,
 15562,
 9926,
 8398,
 14286,
 14183,
 5783,
 10935,
 1806,
 3596,
 3177,
 11479,
 12638,
 4138,
 11784,
 12046,
 2518,
 57,
 257,
 8740,
 4276,
 843,
 5753,
 12511,
 9801,
 4834,
 406,
 12469,
 16117,
 5701,
 12519,
 9795,
 7938,
 9470,
 11839,
 12384,
 6413,
 12732]

In [120]:
with open('../data/hw4_popular.dill', 'wb') as f:
    dill.dump(popular_items, f)