In [1]:
import pandas as pd
import numpy as np
import optuna


from rectools.metrics import MAP, calc_metrics
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import LightFMWrapperModel
from pathlib import Path

from lightfm import LightFM


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# import requests
# from tqdm import tqdm

# url = "https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip"

# req = requests.get(url, stream=True)

# with open("kion.zip", "wb") as fd:
#     total_size_in_bytes = int(req.headers.get("Content-Length", 0))
#     progress_bar = tqdm(desc="kion dataset download", total=total_size_in_bytes, unit="iB", unit_scale=True)
#     for chunk in req.iter_content(chunk_size=2**20):
#         progress_bar.update(len(chunk))
#         fd.write(chunk)

# import zipfile as zf

# files = zf.ZipFile("kion.zip", "r")
# files.extractall()
# files.close()

In [3]:
DATA_PATH = Path("../data")
Columns.Datetime = "last_watch_dt"

users = pd.read_csv(DATA_PATH / "users.csv")
items = pd.read_csv(DATA_PATH / "items.csv")
interactions = pd.read_csv(DATA_PATH / "interactions.csv")


In [4]:
users.head(3)


Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
2,1047345,age_45_54,income_40_60,Ж,0


In [5]:
items.head(3)


Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг..."


In [6]:
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], format="%Y-%m-%d")
interactions[Columns.Weight] = interactions["total_dur"]
max_date = interactions[Columns.Datetime].max()
# train = interactions[interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
# test = interactions[interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()

# print(f"train: {train.shape}")
# print(f"test: {test.shape}")


In [7]:
# train.drop(train.query("total_dur < 300").index, inplace=True)
# cold_users = set(test[Columns.User]) - set(train[Columns.User])
# test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

In [8]:
interactions


Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct,weight
0,176549,9506,2021-05-11,4250,72.0,4250
1,699317,1659,2021-05-29,8317,100.0,8317
2,656683,7107,2021-05-09,10,0.0,10
3,864613,7638,2021-07-05,14483,100.0,14483
4,964868,9506,2021-04-30,6725,100.0,6725
...,...,...,...,...,...,...
5476246,648596,12225,2021-08-13,76,0.0,76
5476247,546862,9673,2021-04-13,2308,49.0,2308
5476248,697262,15297,2021-08-20,18307,63.0,18307
5476249,384202,16197,2021-04-19,6203,100.0,6203


# USER FEATURES

In [9]:
users.isnull().sum() / len(users)

user_id     0.000000
age         0.016776
income      0.017586
sex         0.016462
kids_flg    0.000000
dtype: float64

In [10]:
users.fillna("Unknown", inplace=True)
users = users.loc[users[Columns.User].isin(interactions[Columns.User])].copy()

user_features_frames = []
for feature in ["sex", "age", "income"]:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex


# ITEM FEATURES

In [11]:
items = items.loc[items[Columns.Item].isin(interactions[Columns.Item])].copy()

items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"

content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"

items["director"] = items["directors"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
director_feature = items[["item_id", "director"]].explode("director")
director_feature.columns = ["id", "value"]
director_feature["feature"] = "director"

items["country"] = items["countries"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
country_feature = items[["item_id", "country"]].explode("country")
country_feature.columns = ["id", "value"]
country_feature["feature"] = "country"

year_feature = items.reindex(columns=[Columns.Item, "release_year"])
year_feature.columns = ["id", "value"]
year_feature["feature"] = "release_year"

item_features = pd.concat((genre_feature, content_feature, country_feature, year_feature, director_feature))
item_features.head()


Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


In [12]:
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type", "director", "country", "release_year"],
)

In [13]:
K_RECOS = 10

metrics_name = {
    "MAP": MAP,
}
metrics = {f"{metric_name}@{K_RECOS}": metric(k=K_RECOS) for metric_name, metric in metrics_name.items()}


In [14]:
def objective(trial, dataset, train, test):
    # loss = trial.suggest_categorical("loss", ("warp", "bpr", "logistic"))
    no_components = trial.suggest_categorical("no_components", (4, 6, 8, 10))
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 1.0, log=True)
    epoch = trial.suggest_int("epoch", 1, 10)

    lightfm_model = LightFM(no_components=no_components, loss="warp", learning_rate=learning_rate, random_state=4)

    model = LightFMWrapperModel(
        model=lightfm_model,
        epochs=epoch,
    )

    model.fit(dataset)
    recs = model.recommend(
        users=test[Columns.User].unique(),
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    metrics_vals = calc_metrics(metrics, recs, test, train)
    return metrics_vals["MAP@10"]


study = optuna.create_study(direction="maximize")

[I 2023-12-13 04:57:27,656] A new study created in memory with name: no-name-b7793a9f-56ee-4c40-9630-ad66c893f41a


In [15]:
study.optimize(lambda trial: objective(trial, dataset, train, test), n_trials=10)

[I 2023-12-13 04:58:20,558] Trial 0 finished with value: 0.07593592619463373 and parameters: {'no_components': 10, 'learning_rate': 0.000589882634985533, 'epoch': 2}. Best is trial 0 with value: 0.07593592619463373.
[I 2023-12-13 04:59:47,649] Trial 1 finished with value: 0.07411841745948808 and parameters: {'no_components': 8, 'learning_rate': 0.0004772802182185339, 'epoch': 8}. Best is trial 0 with value: 0.07593592619463373.
[I 2023-12-13 05:00:27,116] Trial 2 finished with value: 0.06965727651484889 and parameters: {'no_components': 4, 'learning_rate': 0.0821819959886023, 'epoch': 2}. Best is trial 0 with value: 0.07593592619463373.
[I 2023-12-13 05:01:42,961] Trial 3 finished with value: 0.0745685609618316 and parameters: {'no_components': 6, 'learning_rate': 0.0011223713674549185, 'epoch': 6}. Best is trial 0 with value: 0.07593592619463373.
[I 2023-12-13 05:02:21,605] Trial 4 finished with value: 0.07521844373503851 and parameters: {'no_components': 10, 'learning_rate': 0.004379

In [37]:
best_params = study.best_params
best_value = study.best_value
print(best_params, best_value)

NameError: name 'study' is not defined

In [14]:
lightfm_model = LightFM(no_components=4, loss='warp', learning_rate=0.012, random_state=42)

model = LightFMWrapperModel(
    model=lightfm_model,
    epochs=5,
)
model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7fac1b7458a0>

In [15]:
lfm_recs = model.recommend(
    dataset.user_id_map.external_ids,
    dataset=dataset,
    k=10,
    filter_viewed=False,
)

lfm_recs.head()

Unnamed: 0,user_id,item_id,score,rank
0,176549,9728,-130.629166,1
1,176549,13865,-130.967178,2
2,176549,10440,-131.164368,3
3,176549,3734,-131.397644,4
4,176549,15297,-131.429977,5


In [16]:
lfm_recs = (
    lfm_recs.groupby(["user_id"])
    .agg({"item_id": lambda x: x.tolist()})
    .reset_index()
    .set_index("user_id")
    .to_dict()["item_id"]
)

In [17]:
lfm_recs

{0: [15297, 10440, 4151, 13865, 4880, 3734, 9728, 2657, 12192, 9996],
 1: [10440, 15297, 9728, 13865, 3734, 4151, 4880, 2657, 7571, 142],
 2: [9728, 10440, 13865, 15297, 4151, 3734, 4880, 7571, 7829, 2657],
 3: [10440, 15297, 4151, 2657, 13865, 9728, 3734, 4880, 6809, 12192],
 4: [10440, 15297, 9728, 13865, 4151, 3734, 4880, 2657, 9996, 6809],
 5: [10440, 9728, 15297, 13865, 2657, 142, 4740, 6809, 3734, 4151],
 7: [15297, 10440, 4151, 2657, 4880, 13865, 3734, 12192, 9996, 9728],
 8: [10440, 15297, 9728, 13865, 4151, 3734, 4880, 2657, 7571, 6809],
 9: [10440, 15297, 4151, 9728, 3734, 13865, 4880, 7571, 2657, 12192],
 10: [10440, 9728, 13865, 15297, 4151, 3734, 4880, 2657, 6809, 7571],
 11: [15297, 10440, 4151, 2657, 13865, 12192, 4880, 3734, 9996, 9728],
 12: [15297, 10440, 2657, 4151, 13865, 3734, 4880, 12192, 9728, 9996],
 13: [10440, 15297, 13865, 4151, 9728, 3734, 4880, 2657, 12192, 9996],
 14: [10440, 15297, 2657, 4151, 3734, 13865, 4880, 9728, 12192, 16228],
 15: [10440, 15297, 26

In [18]:
import pickle

with open('../data/lfm_dataset.pkl', 'wb') as file: 
    pickle.dump(dataset, file) 

In [19]:
with open("../data/lfm_recs.pkl", "wb") as file:
    pickle.dump(lfm_recs, file)

In [20]:
with open('../models/lightfm.pkl', 'wb') as file: 
    pickle.dump(model, file) 

In [21]:
user_embeddings, item_embeddings = model.get_vectors(dataset)

In [22]:
user_embeddings.shape, item_embeddings.shape

((962179, 6), (15706, 6))

In [23]:
import nmslib
from rectools.tools import UserToItemAnnRecommender


In [24]:
alsup = UserToItemAnnRecommender(
    user_vectors=user_embeddings,
    item_vectors=item_embeddings,
    user_id_map=dataset.user_id_map,
    item_id_map=dataset.item_id_map,
)
alsup.fit()

<rectools.tools.ann.UserToItemAnnRecommender at 0x7fab9a2f3400>

In [25]:
dataset.user_id_map.external_ids

array([176549, 699317, 656683, ..., 805174, 648596, 697262])

In [26]:
USER_CHECKOUT = 176549
ITEMS_CHECKOUT = alsup.get_item_list_for_user(176549, top_n=10)

In [27]:
list(alsup.get_item_list_for_user(123, top_n=10))

[15297, 10440, 9728, 4151, 10732, 2657, 7571, 15266, 3182, 13865]

In [28]:
# RECOMMENDS
items[items['item_id'].isin(ITEMS_CHECKOUT)][['item_id', 'title', 'genres']]

Unnamed: 0,item_id,title,genres
767,15297,Клиника счастья,"драмы, мелодрамы"
1849,7571,100% волк,"мультфильм, приключения, семейное, фэнтези, ко..."
1882,10440,Хрустальный,"триллеры, детективы"
2835,10732,Губка Боб Квадратные Штаны,"мультсериалы, приключения, комедии"
3951,3182,Ральф против Интернета,"мультфильм, приключения, фантастика, семейное,..."
6346,9728,Гнев человеческий,"боевики, триллеры"
6501,13865,Девятаев,"драмы, военные, приключения"
6689,4151,Секреты семейной жизни,комедии
12035,15266,Корпорация монстров,"мультфильм, фэнтези, приключения, комедии"
13615,2657,Подслушано,"драмы, триллеры"


In [29]:
items[items['item_id'].isin(interactions[interactions['user_id'] == USER_CHECKOUT]['item_id'].unique())][['item_id', 'title', 'genres']]

Unnamed: 0,item_id,title,genres
57,149,Разлом времени,"фантастика, триллеры, детективы"
440,7453,На линии огня,"боевики, триллеры"
537,5533,Титаник,"драмы, историческое, триллеры, мелодрамы"
723,6870,Красавица и чудовище,"драмы, фэнтези, музыкальные"
906,10881,Быстрее пули,"боевики, драмы, триллеры, криминал"
...,...,...,...
14958,7102,Дочь волка,"боевики, триллеры"
15223,15531,Секса много не бывает,"драмы, комедии"
15352,7582,Холодное сердце II,"фэнтези, мультфильм, музыкальные"
15569,12250,Джон Уик 2,"боевики, триллеры, криминал"


## Online

In [30]:
with open('../models/ann_lightfm.pkl', 'wb') as file: 
    pickle.dump(alsup, file) 

In [31]:
del alsup

In [32]:
with open('../models/ann_lightfm.pkl', 'rb') as file: 
    alsup = pickle.load(file) 

In [33]:
%%time
alsup.get_item_list_for_user(176549, top_n=10)

CPU times: user 49.1 ms, sys: 1.61 ms, total: 50.7 ms
Wall time: 48.4 ms


array([15297, 10440,  9728,  4151, 10732,  2657,  7571, 15266,  3182,
       13865])

# COLD START

In [34]:
cold_users = list(lfm_recs.keys())

with open('../models/cold_users.pkl', 'wb') as file: 
    pickle.dump(cold_users, file)

In [37]:
popular = interactions[interactions['watched_pct'] >= 70].groupby(Columns.Item)[Columns.User].count().sort_values(ascending=False).head(10).to_list()

In [38]:
with open('../models/popular.pkl', 'wb') as file: 
    pickle.dump(popular, file)