# Dependencies

In [1]:
import os

In [2]:
os.environ["OPENBLAS_NUM_THREADS"] = "1"  # For implicit ALS

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm
import dill

from implicit.als import AlternatingLeastSquares

from rectools.metrics import Precision, Recall, MAP, calc_metrics
from rectools.models import PopularModel, RandomModel, ImplicitALSWrapperModel
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel

from pathlib import Path
import typing as tp
from tqdm import tqdm

from lightfm import LightFM

from implicit.bpr import BayesianPersonalizedRanking

from implicit.lmf import LogisticMatrixFactorization

import nmslib

# 1. Data

In [5]:
DATA_PATH = Path("../Lecture 3. Baselines and kNN/kion_train")

users = pd.read_csv(DATA_PATH / 'users.csv')
items = pd.read_csv(DATA_PATH / 'items.csv')
interactions = pd.read_csv(DATA_PATH / 'interactions.csv')

## 1.1. Preprocess

In [6]:
# interactions preprocess
Columns.Datetime = 'last_watch_dt'

interactions.drop(
    interactions[
        interactions[Columns.Datetime].str.len() != 10
    ].index,
    inplace=True
)

interactions[Columns.Datetime] = pd.to_datetime(
    interactions[Columns.Datetime],
    format='%Y-%m-%d'
)

interactions[Columns.Weight] = np.where(
    interactions['watched_pct'] > 10, 3, 1
)

## 1.2. Train test split

In [7]:
max_date = interactions[Columns.Datetime].max()
split_point = max_date - pd.Timedelta(days=7)


train = interactions[
    interactions[Columns.Datetime] < split_point
].copy()
train.drop(train.query("total_dur < 300").index, inplace=True)


test = interactions[
    interactions[Columns.Datetime] >= split_point
].copy()
# отфильтруем холодных пользователей из теста
cold_users = set(test[Columns.User]) - set(train[Columns.User])
test.drop(
    test[test[Columns.User].isin(cold_users)].index,
    inplace=True
)


print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (3832711, 6)
test: (333026, 6)


## 1.3. User features

In [8]:
# preprocess
users.fillna('Unknown', inplace=True)
users = users.loc[
    users[Columns.User].isin(train[Columns.User])
].copy()

# features
user_features_frames = []
for feature in ["sex", "age", "income"]:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex


## 1.4. Item features

In [9]:
items = items.loc[
    items[Columns.Item].isin(train[Columns.Item])
].copy()

# Explode genres to flatten table
items["genre"] = (
    items["genres"].str.lower()
    .str.replace(", ", ",", regex=False)
    .str.split(",")
)
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"
genre_feature.head()

# Content
content_feature = items.reindex(
    columns=[Columns.Item, "content_type"]
)
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"

# Al features
item_features = pd.concat((genre_feature, content_feature))
item_features.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


# 2. Metrics

In [10]:
metrics_name = {
    'Precision': Precision,
    'Recall': Recall,
    'MAP': MAP,
}

metrics = {}
for metric_name, metric in metrics_name.items():
    for k in range(1, 11, 3):
        metrics[f'{metric_name}@{k}'] = metric(k=k)

# 3. Models

Будем рассматривать модели с учетом признаков. В качестве гиперпараметров переберем:
* Размерность эмбеддингов
* Коэффициенты регуляризации
* Количество эпох

## 3.1. Initialization

In [11]:
K_RECOS = 10
RANDOM_STATE = 16
NUM_THREADS = 8
N_FACTORS = [8, 12]

# Implicit:
REGULARIZATION = [0.01, 0.05]

# Lightfm:
N_EPOCHS = [1, 3]
USER_ALPHA = [0.01, 0.05]
ITEM_ALPHA = [0.01, 0.05]
LEARNING_RATE = 0.05

In [12]:
models = {
    'popular': PopularModel(),
}

Рассмотрим модель ALS:

In [13]:
implicit_models = {
    'ALS': AlternatingLeastSquares,
}

for implicit_name, implicit_model in implicit_models.items():
    for is_fitting_features in (True, False):
        for n_factors in N_FACTORS:
            for reg in REGULARIZATION:
                models[
                    f"{implicit_name}_{n_factors}_{is_fitting_features}_{reg}"
                      ] = (
                    ImplicitALSWrapperModel(
                        model=implicit_model(
                            factors=n_factors, 
                            random_state=RANDOM_STATE, 
                            num_threads=NUM_THREADS,
                            regularization=reg
                        ),
                        fit_features_together=is_fitting_features,
                    )
                )

И рассмотрим модель на основе warp loss:

In [14]:
lightfm_losses = ('warp',)

for loss in lightfm_losses:
    for n_factors in N_FACTORS:
        for n_epochs in N_EPOCHS:
            for u_alpha in USER_ALPHA:
                for i_alpha in ITEM_ALPHA:
                    models[
                        f"LightFM_{loss}_{n_factors}_{n_epochs}_{u_alpha}_{i_alpha}"
                    ] = LightFMWrapperModel(
                        LightFM(
                            no_components=n_factors,
                            loss=loss,
                            random_state=RANDOM_STATE,
                            learning_rate=LEARNING_RATE,
                            user_alpha=u_alpha,
                            item_alpha=i_alpha,
                        ),
                        epochs=n_epochs,
                        num_threads=NUM_THREADS,
                    )

## 3.2. Fit

In [86]:
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

In [87]:
TEST_USERS = test[Columns.User].unique()

In [225]:
results = []
iters = tqdm(models.items(), total=len(models.items()))
for model_name, model in iters:
    if model_name in [r['model'] for r in results]:
        continue
        
    print(f"Fitting model {model_name}...")
    model_quality = {'model': model_name}

    model.fit(dataset)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    metric_values = calc_metrics(metrics, recos, test, train)
    model_quality.update(metric_values)
    results.append(model_quality)
    del recos
    
    # with open('fit_results.dill', 'wb') as f:
    #     dill.dump(results, f)

    break

  0%|                                                                             | 0/25 [00:00<?, ?it/s]

Fitting model popular...


  0%|                                                                             | 0/25 [00:20<?, ?it/s]


In [18]:
df_quality = pd.DataFrame(results).T
df_quality.columns = df_quality.iloc[0]
df_quality.drop('model', inplace=True)
df_quality.style.highlight_max(color='lightgreen', axis=1)

model,popular,ALS_8_True_0.01,ALS_8_True_0.05,ALS_12_True_0.01,ALS_12_True_0.05,ALS_8_False_0.01,ALS_8_False_0.05,ALS_12_False_0.01,ALS_12_False_0.05,LightFM_warp_8_1_0.01_0.01,LightFM_warp_8_1_0.01_0.05,LightFM_warp_8_1_0.05_0.01,LightFM_warp_8_1_0.05_0.05,LightFM_warp_8_3_0.01_0.01,LightFM_warp_8_3_0.01_0.05,LightFM_warp_8_3_0.05_0.01,LightFM_warp_8_3_0.05_0.05,LightFM_warp_12_1_0.01_0.01,LightFM_warp_12_1_0.01_0.05,LightFM_warp_12_1_0.05_0.01,LightFM_warp_12_1_0.05_0.05,LightFM_warp_12_3_0.01_0.01,LightFM_warp_12_3_0.01_0.05,LightFM_warp_12_3_0.05_0.01,LightFM_warp_12_3_0.05_0.05
Precision@1,0.069368,0.081866,0.081229,0.080884,0.08099,0.068518,0.068554,0.063172,0.063119,0.040052,0.058392,0.076131,0.074378,0.075599,0.074351,0.069386,0.074316,0.058171,0.039778,0.074316,0.076086,0.075936,0.075971,0.076095,0.074378
Recall@1,0.035863,0.04225,0.041695,0.041371,0.041456,0.033405,0.033423,0.031148,0.031102,0.019392,0.031178,0.040277,0.039099,0.039745,0.03909,0.035872,0.039083,0.028956,0.022456,0.039095,0.040212,0.039867,0.040168,0.040207,0.039099
Precision@4,0.057348,0.052955,0.052553,0.052265,0.051732,0.049408,0.049413,0.047193,0.047209,0.052597,0.047463,0.050072,0.057222,0.056671,0.057007,0.056689,0.057091,0.056943,0.035228,0.052323,0.052188,0.055458,0.051949,0.052309,0.057215
Recall@4,0.112881,0.102997,0.102069,0.101571,0.1002,0.094324,0.094358,0.089689,0.089668,0.103295,0.094449,0.098227,0.112695,0.110916,0.112337,0.111596,0.112486,0.111947,0.072469,0.102809,0.102609,0.109372,0.102249,0.102838,0.112681
Precision@7,0.041242,0.038453,0.038459,0.038276,0.038019,0.039343,0.039351,0.038565,0.03861,0.042456,0.035077,0.040608,0.040551,0.040283,0.039647,0.041032,0.040255,0.041787,0.027682,0.04107,0.040369,0.040687,0.040207,0.040791,0.040649
Recall@7,0.13839,0.127212,0.127023,0.126427,0.125432,0.12905,0.129084,0.125316,0.125465,0.142895,0.118869,0.136256,0.136394,0.135319,0.133442,0.137208,0.135237,0.140236,0.096086,0.137546,0.135566,0.136573,0.135183,0.137016,0.136515
Precision@10,0.032803,0.030882,0.031006,0.030874,0.030836,0.033347,0.033355,0.032994,0.033019,0.035335,0.028124,0.031864,0.032071,0.031434,0.028982,0.032541,0.031359,0.034218,0.023127,0.032036,0.031599,0.032046,0.031731,0.032442,0.032012
Recall@10,0.15607,0.143266,0.143778,0.143102,0.143014,0.153344,0.153377,0.150453,0.150612,0.16926,0.13439,0.1516,0.153496,0.149031,0.138006,0.152435,0.148752,0.162752,0.112871,0.151296,0.150607,0.152545,0.150535,0.15433,0.152263
MAP@1,0.035863,0.04225,0.041695,0.041371,0.041456,0.033405,0.033423,0.031148,0.031102,0.019392,0.031178,0.040277,0.039099,0.039745,0.03909,0.035872,0.039083,0.028956,0.022456,0.039095,0.040212,0.039867,0.040168,0.040207,0.039099
MAP@4,0.066049,0.067475,0.066767,0.066526,0.06616,0.057711,0.057735,0.054363,0.054353,0.051449,0.055519,0.064042,0.067598,0.068418,0.067489,0.065629,0.067528,0.06039,0.04154,0.064758,0.065211,0.068458,0.065061,0.065276,0.067596


В качестве результирующей модели рассмотрим модель на основе LightFM и warp с параметрами:
* `loss`: `warp`
* `no_components`: `12`
* `epochs`: `3`
* `user_alpha`: `0.01`
* `item_alpha`: `0.01`

Данная модель показывает лучшее качество по метрике MAP с увеличением $k$.

In [16]:
MF_final_model = LightFMWrapperModel(
    LightFM(
        no_components=12,
        loss='warp',
        random_state=RANDOM_STATE,
        learning_rate=LEARNING_RATE,
        user_alpha=0.01,
        item_alpha=0.01,
    ),
    epochs=3,
    num_threads=NUM_THREADS,
)

In [17]:
MF_final_model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7f6ee01cf850>

In [91]:
popular_model = PopularModel()
popular_model.fit(dataset)
popular_items_all = popular_model.recommend(
    users=TEST_USERS[:1],
    dataset=dataset,
    k=train['item_id'].shape[0],
    filter_viewed=True,
)['item_id'].tolist()

In [93]:
popular_items_all[:10]

[10440, 15297, 13865, 9728, 4151, 3734, 2657, 142, 6809, 8636]

# 4. HNSW algorithm

## 4.1. Implementation

In [23]:
user_embeds, item_embeds = MF_final_model.get_vectors(dataset)

In [24]:
print(f'user_embeds: {user_embeds.shape}')
print(f'item_embeds: {item_embeds.shape}')

user_embeds: (756562, 14)
item_embeds: (14019, 14)


In [18]:
def augment_inner_product(factors):
    normed_factors = np.linalg.norm(factors, axis=1)
    max_norm = normed_factors.max()
    
    extra_dim = np.sqrt(
        max_norm ** 2 - normed_factors ** 2
    ).reshape(-1, 1)
    augmented_factors = np.append(factors, extra_dim, axis=1)
    return max_norm, augmented_factors


def augment_usres(factors):
    extra_zero = np.zeros((factors.shape[0], 1))
    return np.append(factors, extra_zero, axis=1)


def augments_embeds(ue, ie):
    _, item_embeddings_aug = augment_inner_product(ie)
    users_embeddings_aug = augment_usres(ue)
    return users_embeddings_aug, item_embeddings_aug

In [26]:
user_embeds_aug, item_embeds_aug = augments_embeds(
    user_embeds,
    item_embeds
)

In [27]:
print(f'user_embeds: {user_embeds_aug.shape}')
print(f'item_embeds: {item_embeds_aug.shape}')

user_embeds: (756562, 15)
item_embeds: (14019, 15)


In [99]:
M = 48
efC = 100
num_threads = 4

index_time_params = {
    'M': M,
    'indexThreadQty': num_threads,
    'efConstruction': efC,
}


method = 'hnsw'
space_name = 'negdotprod'
data_type = nmslib.DataType.DENSE_VECTOR

index_params = {
    'method': method,
    'space_name': space_name,
    'data_type': data_type
}


embeddings = {
    'user_embeds_aug': user_embeds_aug,
    'item_embeds_aug': item_embeds_aug,
}

In [101]:
from typing import Tuple, Union


class HNSW:
    def __init__(
        self,
        index_time_params: dict,
        index_params: dict,
        embeddings: dict
    ):
        # Params:
        self.index_time_params = index_time_params
        self.index_params = index_params
        # Index:
        self.index = nmslib.init(
            method = index_params['method'],
            space = index_params['space_name'],
            data_type = index_params['data_type']
        )
        self.index.addDataPointBatch(embeddings['item_embeds_aug'])
        self.index.createIndex(index_time_params)
        # Query params:
        self.index.setQueryTimeParams(
            {'efSearch': index_time_params['efConstruction']}
        )
    
    def query(
        self,
        query: np.array,
        k:int = 10,
        num_threads: int = -1
    ) -> Tuple[np.array, np.array]:
        if query.ndim == 1:
            return self.index.knnQuery(query, k=k)
        else:
            pred = self.index.knnQueryBatch(
                query,
                k=k,
                num_threads=num_threads,
            )
            items, scores = list(zip(*pred))
            return np.array(items), np.array(scores)

In [195]:
hnsw = HNSW(index_time_params, index_params, embeddings)

In [196]:
predict = hnsw.query(
    user_embeds_aug[:5, :],
    k=3,
    num_threads=4
)

In [199]:
predict

(array([[31, 19, 43],
        [31, 19, 43],
        [31, 19, 43],
        [31, 19, 43],
        [31, 19, 43]], dtype=int32),
 array([[ 0.11126111,  0.11154806,  0.11174665],
        [ 0.1113352 ,  0.11162215,  0.11182075],
        [ 0.103005  ,  0.10329195,  0.10349055],
        [ 0.12341713,  0.12370408,  0.12390268],
        [-0.00116866, -0.00088171, -0.00068312]], dtype=float32))

In [201]:
pred_items, pred_scores = hnsw.query(
    user_embeds_aug[10],
    k=3,
    num_threads=4
)

pred_items, pred_scores

(array([31, 19, 43], dtype=int32),
 array([-0.00116406, -0.0008771 , -0.00067851], dtype=float32))

Видно, что предсказания представляют из себя одни и те же айтемы:

In [335]:
pred_items, _ = hnsw.query(
    user_embeds_aug,
    k=10,
    num_threads=4
)

dataset.item_id_map.convert_to_external(np.unique(pred_items))

array([10440, 15297, 13865,  9728,  3734,  6809,  4880,  4151,  2657,
        4740])

In [94]:
popular_items_all[:10]

[10440, 15297, 13865, 9728, 4151, 3734, 2657, 142, 6809, 8636]

Таким образом получилось, что даже с учетом регуляризации, все рекомендации это немного измененное популярное и работает это лучше, чем популярное в контексте, например, $MAP@10$.

## 4.2. Evaluation

In [202]:
# MF
mf_recos = MF_final_model.recommend(
    users=TEST_USERS,
    dataset=dataset,
    k=K_RECOS,
    filter_viewed=True,
)

mf_quality = {'model': 'clear MF'}
mf_metrics = calc_metrics(metrics, mf_recos, test, train)
mf_quality.update(mf_metrics)

In [210]:
# MF + HNSW
test_embeds = user_embeds_aug[
    dataset.user_id_map.convert_to_internal(TEST_USERS)
]

pred_items, _ = hnsw.query(test_embeds, k=10, num_threads=8)

hnsw_recos = pd.DataFrame({
    'user_id': TEST_USERS,
    'item_id': pred_items.tolist()
}).explode('item_id')
hnsw_recos['rank'] = hnsw_recos.groupby('user_id').cumcount() + 1
hnsw_recos['item_id'] = dataset.item_id_map.convert_to_external(
    hnsw_recos['item_id']
)

hnsw_quality = {'model': 'MF + HNSW'}
hnsw_metrics = calc_metrics(metrics, hnsw_recos, test, train)
hnsw_quality.update(hnsw_metrics)

In [211]:
df_quality = pd.DataFrame([mf_quality, hnsw_quality]).T
df_quality.columns = df_quality.iloc[0]
df_quality.drop('model', inplace=True)
df_quality.style.highlight_max(color='lightgreen', axis=1)

model,clear MF,MF + HNSW
Precision@1,0.075936,0.057375
Recall@1,0.039867,0.030132
Precision@4,0.055414,0.048309
Recall@4,0.109286,0.096593
Precision@7,0.040427,0.038014
Recall@7,0.135851,0.128765
Precision@10,0.031723,0.028452
Recall@10,0.151352,0.136605
MAP@1,0.039867,0.030132
MAP@4,0.068431,0.056272


## 4.3. HNSW Hyperparameters

Посмотрим, какой trade-off время/качетсво предлагает hnsw в нашем случае.

In [212]:
import time

In [215]:
def init_configs(
    M_1: int, efC_1: int, num_threads_1: int,
    method_1: str, space_name_1: str, data_type_1: nmslib.DataType,
    user_embeds_aug_1: np.array, item_embeds_aug_1: np.array
) -> Tuple[dict, dict, dict]:
    index_time_params_0 = {
        'M': M_1,
        'indexThreadQty': num_threads_1,
        'efConstruction': efC_1,
    }
    index_params_0 = {
        'method': method_1,
        'space_name': space_name_1,
        'data_type': data_type_1
    }
    embeddings_0 = {
        'user_embeds_aug': user_embeds_aug_1,
        'item_embeds_aug': item_embeds_aug_1,
    }
    return index_time_params_0, index_params_0, embeddings_0


def transform_hnsw_to_rectools_recs(
    preds: np.array, out_test_users: np.array
) -> pd.DataFrame:
    rect_recos = pd.DataFrame({
        'user_id': out_test_users,
        'item_id': preds.tolist()
    }).explode('item_id')
    rect_recos['rank'] = (
        rect_recos
        .groupby('user_id')
        .cumcount() + 1
    )
    rect_recos['item_id'] = dataset.item_id_map.convert_to_external(
        rect_recos['item_id']
    )
    return rect_recos


def make_hnsw_test(
    model_name: str, rect_recos: pd.DataFrame, metrics: dict,
    test: pd.DataFrame, train: pd.DataFrame
) -> dict:
    quality = {'model': model_name}
    metrics = calc_metrics(metrics, rect_recos, test, train)
    quality.update(metrics)
    return quality

In [216]:
# HNSW params:
M = range(10, 110, 10)
efC = range(20, 200, 15)
num_threads = 8

total = len(M)*len(efC)

method = 'hnsw'
space_name = 'negdotprod'
data_type = nmslib.DataType.DENSE_VECTOR

# Evaluate params:
test_embeds = user_embeds_aug[
    dataset.user_id_map.convert_to_internal(TEST_USERS)
]

# GridSearch:
hnsw_results = []
cnt = 0
for m in tqdm(M):
    for efc in efC:
        cnt += 1
        # init model:
        a, b, c = init_configs(
            M_1=m, efC_1=efc, num_threads_1=num_threads,
            method_1=method, space_name_1=space_name,
            data_type_1=data_type,
            user_embeds_aug_1=user_embeds_aug,
            item_embeds_aug_1=item_embeds_aug
        )
        hnsw_c = HNSW(a, b, c)
        # evaluate:
        start = time.time()
        pred_items, _ = hnsw_c.query(
            test_embeds, k=10, num_threads=8
        )
        end = time.time()
        rect_recs_c = transform_hnsw_to_rectools_recs(
            pred_items, TEST_USERS
        )
        model_name = f'hnsw_{m}_{efc}'
        qual_c = make_hnsw_test(
            model_name, rect_recs_c, metrics, test, train
        )
        sec = round(end - start, 3)
        qual_c['time sec'] = sec
        # save quality:
        hnsw_results.append(qual_c)
        with open('hnsw_results.dill_1', 'wb') as f:
            dill.dump(hnsw_results, f)
            
        # Stdout:
        print(
            f'{cnt}/{total} | {sec} sec |',
            f'MAP@10: {round(hnsw_results[-1]["MAP@10"], 3)}'
        )

  0%|                                                                             | 0/10 [00:00<?, ?it/s]

1/120 | 0.971 sec | MAP@10: 0.064
2/120 | 1.326 sec | MAP@10: 0.0
3/120 | 1.279 sec | MAP@10: 0.0
4/120 | 1.171 sec | MAP@10: 0.064
5/120 | 2.159 sec | MAP@10: 0.0
6/120 | 2.655 sec | MAP@10: 0.0
7/120 | 1.413 sec | MAP@10: 0.064
8/120 | 2.601 sec | MAP@10: 0.0
9/120 | 2.511 sec | MAP@10: 0.064
10/120 | 3.478 sec | MAP@10: 0.064
11/120 | 4.331 sec | MAP@10: 0.0


 10%|██████▉                                                              | 1/10 [00:53<07:58, 53.21s/it]

12/120 | 4.824 sec | MAP@10: 0.0
13/120 | 0.982 sec | MAP@10: 0.064
14/120 | 1.319 sec | MAP@10: 0.064
15/120 | 1.519 sec | MAP@10: 0.0
16/120 | 1.699 sec | MAP@10: 0.0
17/120 | 2.276 sec | MAP@10: 0.0
18/120 | 2.882 sec | MAP@10: 0.0
19/120 | 1.837 sec | MAP@10: 0.064
20/120 | 3.872 sec | MAP@10: 0.0
21/120 | 2.182 sec | MAP@10: 0.064
22/120 | 6.562 sec | MAP@10: 0.064
23/120 | 6.803 sec | MAP@10: 0.064


 20%|█████████████▊                                                       | 2/10 [02:09<08:54, 66.86s/it]

24/120 | 8.395 sec | MAP@10: 0.064
25/120 | 0.851 sec | MAP@10: 0.0
26/120 | 1.091 sec | MAP@10: 0.064
27/120 | 1.347 sec | MAP@10: 0.064
28/120 | 1.644 sec | MAP@10: 0.064
29/120 | 1.945 sec | MAP@10: 0.064
30/120 | 3.437 sec | MAP@10: 0.064
31/120 | 3.858 sec | MAP@10: 0.0
32/120 | 2.448 sec | MAP@10: 0.064
33/120 | 2.538 sec | MAP@10: 0.064
34/120 | 2.641 sec | MAP@10: 0.064
35/120 | 2.995 sec | MAP@10: 0.064


 30%|████████████████████▋                                                | 3/10 [03:52<09:44, 83.47s/it]

36/120 | 4.472 sec | MAP@10: 0.064
37/120 | 1.08 sec | MAP@10: 0.064
38/120 | 1.081 sec | MAP@10: 0.0
39/120 | 1.79 sec | MAP@10: 0.0
40/120 | 1.67 sec | MAP@10: 0.064
41/120 | 2.938 sec | MAP@10: 0.0
42/120 | 1.923 sec | MAP@10: 0.064
43/120 | 3.463 sec | MAP@10: 0.064
44/120 | 5.004 sec | MAP@10: 0.064
45/120 | 7.367 sec | MAP@10: 0.064
46/120 | 5.696 sec | MAP@10: 0.064
47/120 | 4.419 sec | MAP@10: 0.064


 40%|███████████████████████████▏                                        | 4/10 [06:27<11:08, 111.38s/it]

48/120 | 7.268 sec | MAP@10: 0.064
49/120 | 1.03 sec | MAP@10: 0.0
50/120 | 0.986 sec | MAP@10: 0.0
51/120 | 2.016 sec | MAP@10: 0.0
52/120 | 2.006 sec | MAP@10: 0.05
53/120 | 2.112 sec | MAP@10: 0.064
54/120 | 3.879 sec | MAP@10: 0.064
55/120 | 3.905 sec | MAP@10: 0.064
56/120 | 4.587 sec | MAP@10: 0.064
57/120 | 5.999 sec | MAP@10: 0.0
58/120 | 6.306 sec | MAP@10: 0.064
59/120 | 5.054 sec | MAP@10: 0.064


 50%|██████████████████████████████████                                  | 5/10 [10:22<13:00, 156.04s/it]

60/120 | 7.077 sec | MAP@10: 0.064
61/120 | 3.191 sec | MAP@10: 0.0
62/120 | 1.425 sec | MAP@10: 0.0
63/120 | 2.09 sec | MAP@10: 0.0
64/120 | 2.53 sec | MAP@10: 0.0
65/120 | 1.986 sec | MAP@10: 0.0
66/120 | 2.628 sec | MAP@10: 0.064
67/120 | 4.619 sec | MAP@10: 0.064
68/120 | 2.889 sec | MAP@10: 0.064
69/120 | 7.671 sec | MAP@10: 0.064
70/120 | 3.449 sec | MAP@10: 0.064
71/120 | 8.095 sec | MAP@10: 0.064


 60%|████████████████████████████████████████▊                           | 6/10 [15:48<14:15, 213.78s/it]

72/120 | 6.282 sec | MAP@10: 0.064
73/120 | 1.31 sec | MAP@10: 0.064
74/120 | 1.601 sec | MAP@10: 0.064
75/120 | 1.866 sec | MAP@10: 0.064
76/120 | 2.715 sec | MAP@10: 0.0
77/120 | 2.385 sec | MAP@10: 0.064
78/120 | 4.087 sec | MAP@10: 0.064
79/120 | 4.367 sec | MAP@10: 0.064
80/120 | 4.244 sec | MAP@10: 0.064
81/120 | 6.782 sec | MAP@10: 0.064
82/120 | 3.355 sec | MAP@10: 0.064
83/120 | 5.887 sec | MAP@10: 0.064


 70%|███████████████████████████████████████████████▌                    | 7/10 [24:02<15:16, 305.47s/it]

84/120 | 8.94 sec | MAP@10: 0.064
85/120 | 1.035 sec | MAP@10: 0.064
86/120 | 1.35 sec | MAP@10: 0.064
87/120 | 1.834 sec | MAP@10: 0.001
88/120 | 4.268 sec | MAP@10: 0.064
89/120 | 2.869 sec | MAP@10: 0.0
90/120 | 4.872 sec | MAP@10: 0.064
91/120 | 3.152 sec | MAP@10: 0.064
92/120 | 3.261 sec | MAP@10: 0.064
93/120 | 2.887 sec | MAP@10: 0.064
94/120 | 3.207 sec | MAP@10: 0.064
95/120 | 5.459 sec | MAP@10: 0.064


 80%|██████████████████████████████████████████████████████▍             | 8/10 [37:55<15:46, 473.41s/it]

96/120 | 5.296 sec | MAP@10: 0.064
97/120 | 0.716 sec | MAP@10: 0.0
98/120 | 1.328 sec | MAP@10: 0.064
99/120 | 1.992 sec | MAP@10: 0.064
100/120 | 4.348 sec | MAP@10: 0.064
101/120 | 2.615 sec | MAP@10: 0.001
102/120 | 5.6 sec | MAP@10: 0.064
103/120 | 5.484 sec | MAP@10: 0.0
104/120 | 2.953 sec | MAP@10: 0.064
105/120 | 3.483 sec | MAP@10: 0.064
106/120 | 3.693 sec | MAP@10: 0.064
107/120 | 5.859 sec | MAP@10: 0.064


 90%|█████████████████████████████████████████████████████████████▏      | 9/10 [56:31<11:14, 674.36s/it]

108/120 | 8.102 sec | MAP@10: 0.064
109/120 | 1.325 sec | MAP@10: 0.064
110/120 | 1.086 sec | MAP@10: 0.0
111/120 | 1.908 sec | MAP@10: 0.064
112/120 | 2.782 sec | MAP@10: 0.064
113/120 | 2.789 sec | MAP@10: 0.064
114/120 | 3.192 sec | MAP@10: 0.064
115/120 | 2.268 sec | MAP@10: 0.064
116/120 | 3.643 sec | MAP@10: 0.064
117/120 | 3.359 sec | MAP@10: 0.064
118/120 | 4.175 sec | MAP@10: 0.064
119/120 | 9.217 sec | MAP@10: 0.064


100%|█████████████████████████████████████████████████████████████████| 10/10 [1:23:26<00:00, 500.61s/it]

120/120 | 7.019 sec | MAP@10: 0.064





Судя по всему значения $MAP@10$ отличаются в незначимых числах после запятой. Тогда отберем наилучшее решение по времени работы.

In [217]:
max([r['MAP@10'] for r in hnsw_results])

0.0640108277103709

In [248]:
sorted_by_time = sorted(
    filter(lambda r: r['MAP@10'] > 0.06, hnsw_results),
    key=lambda x: x['time sec']
)
[
    (r['model'], r['time sec'], r['MAP@10']) for r in sorted_by_time
][:4]

[('hnsw_10_20', 0.971, 0.0640108277103709),
 ('hnsw_20_20', 0.982, 0.0640108277103709),
 ('hnsw_80_20', 1.035, 0.0640108277103709),
 ('hnsw_40_20', 1.08, 0.0640108277103709)]

In [102]:
a_best, b_best, c_best = init_configs(
    M_1=10, efC_1=20, num_threads_1=num_threads,
    method_1=method, space_name_1=space_name,
    data_type_1=data_type,
    user_embeds_aug_1=user_embeds_aug,
    item_embeds_aug_1=item_embeds_aug
)
hnsw_best = HNSW(a_best, b_best, c_best)

In [106]:
predict_items, _ = hnsw_best.query(
    user_embeds_aug,
    k=10,
    num_threads=4
)

predict_items_unique = np.unique(predict_items, axis=0)
if predict_items_unique.shape[0] == 1:
    unique_res = dataset.item_id_map.convert_to_external(
        predict_items_unique[0]
    )
else:
    unique_res = dataset.item_id_map.convert_to_external(
        np.unique(predict_items_unique)
    )
print(unique_res)

[ 9996 11237  5732  5658 14901  7571  1819  7793  9194 16228]


Рекомендуемые айтемы несколько изменились относительно популярного.

In [107]:
popular_items_all[:10]

[10440, 15297, 13865, 9728, 4151, 3734, 2657, 142, 6809, 8636]

In [108]:
# Индексы рекомендуемых hnsw айтемов в популярном:
np.where(np.isin(popular_items_all, unique_res))[0]

array([10, 13, 14, 19, 30, 32, 34, 60, 73, 82])

# 5. Avatars

In [381]:
np.random.seed(3)

## 5.1. Avatar 1

Посмотрим, какие жанры фильмов самые не популярные и построим аватара на их основе.

Ожидаемые рекомендации от модели: фильмы того же жанра и как следствие - отсутствие переобучения на популярные жанры картин.

In [315]:
genre_counts = item_features[
    item_features['feature'] == 'genre'
].value_counts('value')

In [340]:
genre_counts.tail(40).iloc[:-20]

value
мировая классика       24
катастрофы             22
живая природа          20
вокруг света           19
дорамы                 18
медицинские            16
для самых маленьких    15
телешоу                14
короткий метр          13
стендап                12
увлечения              12
по комиксам             9
охота и рыбалка         9
ток-шоу                 9
футбол                  9
фильмы-спектакли        9
юмор                    7
популярное              7
единоборства            7
тележурналы             6
dtype: int64

К сожалнию видны ошибки в некоторых наименованиях, однако примем их как есть. Отберем не самые последние и тематически связанные, чтобы модели было что рекомендовать:

In [408]:
chosen_genre = ['живая природа', 'вокруг света', 'охота и рыбалка']

И сделаем пользователя примерно средним по количеству взаимодействий.

In [409]:
interactions.groupby('user_id').count()['item_id'].mean().round(3)

5.692

In [410]:
genre_features = item_features[item_features['feature'] == 'genre']
same_genre_items = genre_features[
    genre_features['value'].isin(chosen_genre)
]['id'].unique()
same_genre_items.shape[0]

38

Возьмем 7 фильмов:

In [411]:
avatar_1_items = np.random.choice(
    same_genre_items, 7, replace=False
)
avatar_1_items

array([ 9176, 13696,  8916,  4586, 13839, 12794,   229])

Построим интеракции с ними:

In [593]:
def make_avatar_interactions(
    avatar_id, avatar_items, train_columns
):
    interaction_points = ['total_dur', 'watched_pct']
    
    avatar_interactions = pd.DataFrame(np.stack([
        np.random.choice(train[ip], avatar_items.shape[0])
        for ip in interaction_points
    ])).T
    avatar_interactions['last_watch_dt'] = np.random.choice(
        train['last_watch_dt'], avatar_items.shape[0], replace=False
    )
    avatar_interactions.columns = (
        interaction_points + ['last_watch_dt']
    )
    avatar_interactions['weight'] = (
        np.where(avatar_interactions['watched_pct'] > 10, 3, 1)
    )
    
    avatar_interactions['item_id'] = avatar_items
    avatar_interactions['user_id'] = avatar_id
    avatar_interactions = avatar_interactions[train_columns]
    
    return avatar_interactions

In [596]:
avatar_1_id = users['user_id'].max() + 1
avatar_1_interactions = make_avatar_interactions(
    avatar_1_id, avatar_1_items, train.columns
)
avatar_1_interactions

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct,weight
0,1097559,9176,2021-08-02,14733.0,97.0,3
1,1097559,13696,2021-05-07,30570.0,64.0,3
2,1097559,8916,2021-05-01,12117.0,100.0,3
3,1097559,4586,2021-07-26,5920.0,100.0,3
4,1097559,13839,2021-06-29,320.0,94.0,3
5,1097559,12794,2021-05-28,47498.0,97.0,3
6,1097559,229,2021-07-14,5821.0,78.0,3


Сгенерируем пользователю средние признаки:

In [451]:
avatar_1_features = pd.DataFrame({
    'id': [avatar_1_id]*3,
    'value': ['age_35_44', 'Ж', 'income_40_60'],
    'feature': ['age', 'sex', 'income',]
})

avatar_1_features

Unnamed: 0,id,value,feature
0,1097559,age_35_44,age
1,1097559,Ж,sex
2,1097559,income_40_60,income


## 5.2. Avatar 2

Оценим способность модели строить рекомендации на основе признаков пользователя. Возьмем самую не популярную группу пользоавтелей по значениям признаков и добавим аватара с одним просмотром какой-нибудь картины, из этой группы.

Ожидаемые рекомендации от модели: популярные картины, среди выбранной группы.

In [540]:
min_feature_values = (
    user_features[
    ~(user_features['value'] == 'Unknown')
].groupby(['feature', 'value']).count()
.reset_index().groupby('feature').min()
)
min_feature_values

Unnamed: 0_level_0,value,id
feature,Unnamed: 1_level_1,Unnamed: 2_level_1
age,age_18_24,31750
income,income_0_20,832
sex,Ж,276357


In [571]:
min_users_group = user_features[
    user_features['value'].isin(min_feature_values['value'])
]['id'].unique()

random_user = np.random.choice(min_users_group, 1)
print(f'random_user: {random_user}')

random_item = np.random.choice(
        train[train['user_id'] == random_user[0]]['item_id'], 1
    )
print(f'random_item: {random_item}')

random_user: [259793]
random_item: [13865]


In [590]:
min_group_popular = train[
    train['user_id'].isin(min_users_group)
]['item_id'].value_counts().index.tolist()

min_group_popular[:10]

[15297, 10440, 9728, 13865, 4151, 3734, 2657, 4880, 142, 8636]

Создадим аватару взаимодействия и признаки:

In [602]:
avatar_2_id = avatar_1_id + 1
avatar_2_interactions = make_avatar_interactions(
    avatar_2_id, random_item, train.columns
)
avatar_2_interactions

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct,weight
0,1097560,13865,2021-06-09,508.0,23.0,3


In [609]:
avatar_2_features = pd.DataFrame({
    'id': [avatar_2_id]*3,
    'value': min_feature_values.loc[
        ['age', 'sex', 'income'], 'value'
    ].values,
    'feature': ['age', 'sex', 'income',]
})

avatar_2_features

Unnamed: 0,id,value,feature
0,1097560,age_18_24,age
1,1097560,Ж,sex
2,1097560,income_0_20,income


## 5.3. Avatar 3

Рассмотрим пятый по популярности жанр и построим аватар с большим количеством просмотров этого жанра.

Ожидаемые рекомендации модели: фильмы того же жанра.

In [614]:
genre_counts.head()

value
драмы         4923
комедии       3479
зарубежные    3055
мелодрамы     2533
триллеры      2297
dtype: int64

In [623]:
triller_genre_items = genre_features[
    genre_features['value'] == 'триллеры'
]['id'].unique()

avatar_3_items = np.random.choice(
    triller_genre_items, 25, replace=False
)
avatar_3_items

array([ 2440, 10657,  5755,  6067,  3790,  6781,  8283, 15968,  9726,
        2950,  5809,  2032, 12259, 11735, 11117, 15306, 12114,   719,
        8226,  2408, 12637,  3136, 11264,  4964, 15384])

Создадим взаимодействия аватара и средние признаки:

In [625]:
avatar_3_id = avatar_2_id + 1
avatar_3_interactions = make_avatar_interactions(
    avatar_3_id, avatar_3_items, train.columns
)

print(avatar_3_interactions.shape)
avatar_3_interactions.head()

(25, 6)


Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct,weight
0,1097561,2440,2021-07-05,8334.0,16.0,3
1,1097561,10657,2021-06-27,1126.0,39.0,3
2,1097561,5755,2021-07-18,1294.0,100.0,3
3,1097561,6067,2021-07-05,8625.0,100.0,3
4,1097561,3790,2021-06-02,4600.0,100.0,3


In [626]:
avatar_3_features = pd.DataFrame({
    'id': [avatar_3_id]*3,
    'value': ['age_35_44', 'Ж', 'income_40_60'],
    'feature': ['age', 'sex', 'income',]
})

avatar_3_features

Unnamed: 0,id,value,feature
0,1097561,age_35_44,age
1,1097561,Ж,sex
2,1097561,income_40_60,income


## 5.4. Collect and analize avatars

In [627]:
avatars = {
    'avatar_1': (avatar_1_interactions, avatar_1_features),
    'avatar_2': (avatar_2_interactions, avatar_2_features),
    'avatar_3': (avatar_3_interactions, avatar_3_features),
}

for av_name, av_features in avatars.items():
    train = pd.concat([train, av_features[0]])
    user_features = pd.concat([user_features, av_features[1]])

### Fit model with new users

In [630]:
dataset_avatars = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

In [631]:
MF_final_model_avatars = LightFMWrapperModel(
    LightFM(
        no_components=12,
        loss='warp',
        random_state=RANDOM_STATE,
        learning_rate=LEARNING_RATE,
        user_alpha=0.01,
        item_alpha=0.01,
    ),
    epochs=3,
    num_threads=NUM_THREADS,
)

In [632]:
MF_final_model_avatars.fit(dataset_avatars)

<rectools.models.lightfm.LightFMWrapperModel at 0x7f269a4febb0>

In [635]:
mf_recos_avs = MF_final_model_avatars.recommend(
    users=TEST_USERS,
    dataset=dataset,
    k=K_RECOS,
    filter_viewed=True,
)

map_metric_avs = calc_metrics(
    metrics, mf_recos_avs, test, train
)['MAP@10']
map_metric_avs

0.07407306029738925

### Check by source model

In [660]:
mf_recos_avs_users = MF_final_model_avatars.recommend(
    users=np.array([avatar_1_id, avatar_2_id, avatar_3_id]),
    dataset=dataset,
    k=K_RECOS,
    filter_viewed=True,
)

In [665]:
mf_recos_avs_users.head(3)

Unnamed: 0,user_id,item_id,score,rank
0,1097559,15297,-0.117178,1
1,1097559,10440,-0.117464,2
2,1097559,13865,-0.117654,3


#### Avatar 1

Посмотрим на жанры рекомендованных фильмов:

In [671]:
av_1_rec_model_items = mf_recos_avs_users[
    mf_recos_avs_users['user_id'] == avatar_1_id
]['item_id']

In [686]:
item_features[
    item_features['id'].isin(av_1_rec_model_items)
][
    item_features[
        item_features['id'].isin(av_1_rec_model_items)
    ]['value'].isin(chosen_genre)
]

Unnamed: 0,id,value,feature


Ни один из рекомендованных фильмов не попал в искомый жанр.

#### Avatar 2

Посмотрим, на каких местах стоят рекомендованные айтемы:

In [688]:
av_2_rec_model_items = mf_recos_avs_users[
    mf_recos_avs_users['user_id'] == avatar_2_id
]['item_id']

In [689]:
np.where(np.isin(min_group_popular, av_2_rec_model_items))

(array([ 0,  1,  2,  4,  5,  6,  7,  8, 10, 18]),)

Почти все айтемы рекомендованы и упорядочены в соответсвии с предполагаемым набором рекомендаций.

#### Avatar 3

Изучим предлагаемые рекомендательной системой жанры:

In [695]:
av_3_rec_model_items = mf_recos_avs_users[
    mf_recos_avs_users['user_id'] == avatar_3_id
]['item_id']

In [697]:
item_features[
    item_features['id'].isin(av_3_rec_model_items)
][
    item_features[
        item_features['id'].isin(av_3_rec_model_items)
    ]['value'].isin(chosen_genre)
]

Unnamed: 0,id,value,feature


Ни один из рекомендованных фильмов не попал в искомый жанр.

### Conclusion

Отметим следующее.

1. Модель имеет сдвиг на популярные айтемы, что было выявлено ранее. Это как минимум означает, что не популярные айтемы/жанры айтемов вряд ли попадут в рекомендованные.

2. Выбраная категория людей для второго аватара смотрит популярное не меньше остальных, поэтому популярное в этой группе не отличается от популярного по всем пользователям. Это лишний раз говорит о том, что моделью рекомендуется скорее популярное.

# 6. Cold users

Холодным пользвателям будем рекомендовать популярное.

# 7. Optimize HNSW model implementation for one user

In [124]:
import numpy as np
import nmslib
import typing as tp


class HNSWModel:
    def __init__(
        self,
        index_time_params: tp.Dict[str, int],
        index_params: tp.Dict[str, tp.Union[int, nmslib.DataType]],
        embeddings: tp.Dict[str, np.array],
        maps: tp.Dict[str, Dict[int, int]],
        pops: List[int],
    ):
        # Params:
        self.index_time_params = index_time_params
        self.index_params = index_params
        # Embeddings:
        self.embeddings = embeddings
        # Index:
        self.index = nmslib.init(
            method = index_params['method'],
            space = index_params['space_name'],
            data_type = index_params['data_type']
        )
        self.index.addDataPointBatch(embeddings['item_embeds_aug'])
        self.index.createIndex(index_time_params)
        # Query params:
        self.index.setQueryTimeParams(
            {'efSearch': index_time_params['efConstruction']}
        )
        # Maps:
        self.user_map = maps['user_map']
        self.user_map_inv = {
            v: k for k, v in maps['user_map'].items()
        }
        self.item_map = maps['item_map']
        self.item_map_inv = {
            v: k for k, v in maps['item_map'].items()
        }
        # Popular items:
        self.pops = pops
    
    def predict(self, user_id: int, k: int = 10) -> List[int]:
        in_user_id = self.user_map.get(user_id)
        print(f'in_user_id: {in_user_id}')
        if in_user_id is None:
            return self.pops[:k]
        else:
            query = self.embeddings['user_embeds_aug'][
                in_user_id
            ]
            pred_items, _ = self.index.knnQuery(query, k=k)
            return list(map(self.item_map_inv.get, pred_items))
    
    
def init_configs(
    M_1: int, efC_1: int, num_threads_1: int,
    method_1: str, space_name_1: str, data_type_1: nmslib.DataType,
    user_embeds_aug_1: np.array, item_embeds_aug_1: np.array
) -> Tuple[
    tp.Dict[str, int],
    tp.Dict[str, tp.Union[int, nmslib.DataType]],
    tp.Dict[str, np.array]
]:
    index_time_params_0 = {
        'M': M_1,
        'indexThreadQty': num_threads_1,
        'efConstruction': efC_1,
    }
    index_params_0 = {
        'method': method_1,
        'space_name': space_name_1,
        'data_type': data_type_1
    }
    embeddings_0 = {
        'user_embeds_aug': user_embeds_aug_1,
        'item_embeds_aug': item_embeds_aug_1,
    }
    return index_time_params_0, index_params_0, embeddings_0

In [125]:
# Prepare embeddings:
user_embeds, item_embeds = MF_final_model.get_vectors(dataset)

print(f'user embeds: {user_embeds.shape}')
print(f'item embeds: {item_embeds.shape}\n')


user_embeds_aug, item_embeds_aug = augments_embeds(
    user_embeds,
    item_embeds
)

print(f'user embeds aug: {user_embeds_aug.shape}')
print(f'item embeds aug: {item_embeds_aug.shape}')

user embeds: (756562, 14)
item embeds: (14019, 14)

user embeds aug: (756562, 15)
item embeds aug: (14019, 15)


In [126]:
# Save embeddings:
np.save('user_embeds_aug.npy', user_embeds_aug)
np.save('item_embeds_aug.npy', item_embeds_aug)

In [127]:
# Load embeddings:
user_embeds_aug = np.load('user_embeds_aug.npy')
item_embeds_aug = np.load('item_embeds_aug.npy')

print(f'user embeds aug: {user_embeds_aug.shape}')
print(f'item embeds aug: {item_embeds_aug.shape}')

user embeds aug: (756562, 15)
item embeds aug: (14019, 15)


In [128]:
# Users/Items maps:
items_ext = train['item_id'].unique()
items_int = dataset.item_id_map.convert_to_internal(items_ext)
items_map = {
    it_ex: it_in for it_ex, it_in in zip(items_ext, items_int)
}

users_ext = train['user_id'].unique()
users_int = dataset.user_id_map.convert_to_internal(users_ext)
users_map = {
    us_ex: us_in for us_ex, us_in in zip(users_ext, users_int)
}

maps = {
    'user_map': users_map,
    'item_map': items_map
}

In [129]:
# Save popular:
np.save('popular_items.npy', popular_items_all)

In [130]:
# Load popular:
popular_items_all = np.load('popular_items.npy').tolist()
popular_items_all[:10]

[10440, 15297, 13865, 9728, 4151, 3734, 2657, 142, 6809, 8636]

In [131]:
a_best, b_best, c_best = init_configs(
    M_1=10, efC_1=20, num_threads_1=8,
    method_1='hnsw', space_name_1='negdotprod',
    data_type_1=nmslib.DataType.DENSE_VECTOR,
    user_embeds_aug_1=user_embeds_aug,
    item_embeds_aug_1=item_embeds_aug
)

hnsw_best = HNSWModel(
    a_best, b_best, c_best, maps, popular_items_all
)

In [132]:
test['user_id'].unique()

array([203219, 200197,  73446, ..., 623792, 442859, 857162])

In [133]:
hnsw_best.predict(73446, k=10)

in_user_id: 65114


[2657, 9996, 5732, 5658, 7417, 7829, 14901, 7571, 1819, 7793]

In [146]:
hnsw_best.predict(623792, k=10)

in_user_id: 561844


[2657, 9996, 5732, 5658, 7417, 7829, 14901, 7571, 1819, 7793]

In [147]:
cold_user = -1
hnsw_best.predict(cold_user, k=10)

in_user_id: None


[10440, 15297, 13865, 9728, 4151, 3734, 2657, 142, 6809, 8636]