In [1]:
import os

In [2]:
os.environ["OPENBLAS_NUM_THREADS"] = "1"  # For implicit ALS

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
import pandas as pd
import numpy as np

from implicit.als import AlternatingLeastSquares

from rectools.metrics import Precision, Recall, MAP, calc_metrics
from rectools.models import PopularModel, RandomModel, ImplicitALSWrapperModel
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel, PureSVDModel

import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.pyplot as plt
from pathlib import Path
import typing as tp
from tqdm import tqdm

from lightfm import LightFM

from implicit.bpr import BayesianPersonalizedRanking

from implicit.lmf import LogisticMatrixFactorization

In [5]:
%cd ../..

/home/kirill/PycharmProjects/pythonProject


In [6]:
DATA_PATH = Path("data/kion_train")

# LOAD DATA 

In [7]:
%%time
users = pd.read_csv(DATA_PATH / 'users.csv')
items = pd.read_csv(DATA_PATH / 'items.csv')
interactions = pd.read_csv(DATA_PATH / 'interactions.csv')

CPU times: user 2.18 s, sys: 882 ms, total: 3.06 s
Wall time: 3.33 s


# Preprocess

In [8]:
# Rename the raw watch-date column to RecTools' standard datetime column name instead of
# monkey-patching the library: reassigning Columns.Datetime mutates a class attribute shared
# by every RecTools component in this kernel and may leave values derived from the original
# name out of sync. Re-running this cell is a no-op once the column has been renamed.
interactions = interactions.rename(columns={'last_watch_dt': Columns.Datetime})

In [9]:
# Discard rows whose date string is not exactly 10 characters long,
# so that the strict '%Y-%m-%d' parsing in the next step cannot fail on them
bad_date_rows = interactions.index[(interactions[Columns.Datetime].str.len() != 10).values]
interactions.drop(index=bad_date_rows, inplace=True)

In [10]:
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], format='%Y-%m-%d')

In [11]:
max_date = interactions[Columns.Datetime].max()

In [12]:
# Interaction weight: 3 when watched_pct is above 10, otherwise 1.
# Rows with a missing watched_pct also get 1, because NaN > 10 evaluates to False.
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

In [13]:
# Time-based split: interactions dated within 7 days of max_date (inclusive) form the
# test set, everything strictly earlier forms the train set
split_date = max_date - pd.Timedelta(days=7)
watch_dates = interactions[Columns.Datetime]

train = interactions[watch_dates < split_date].copy()
test = interactions[watch_dates >= split_date].copy()

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (4985269, 6)
test: (490982, 6)


In [14]:
# Remove train interactions whose total_dur is below 300 (in place)
short_view_rows = train.index[(train["total_dur"] < 300).values]
train.drop(index=short_view_rows, inplace=True)

In [15]:
# Collect cold users (present in test but absent from train) so they can be filtered out of test
cold_users = set(test[Columns.User]) - set(train[Columns.User])

In [16]:
# Remove every test interaction that belongs to a cold user (in place)
cold_user_rows = test.index[test[Columns.User].isin(cold_users).values]
test.drop(index=cold_user_rows, inplace=True)

# Prepare features

## User features

In [17]:
users.isnull().sum()

user_id         0
age         14095
income      14776
sex         13831
kids_flg        0
dtype: int64

In [18]:
users.fillna('Unknown', inplace=True)

In [19]:
users.nunique()

user_id     840197
age              7
income           7
sex              3
kids_flg         2
dtype: int64

In [20]:
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()

In [21]:
users

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0
5,1037719,age_45_54,income_60_90,М,0
...,...,...,...,...,...
840184,529394,age_25_34,income_40_60,Ж,0
840186,80113,age_25_34,income_40_60,Ж,0
840188,312839,age_65_inf,income_60_90,Ж,0
840189,191349,age_45_54,income_40_60,М,1


In [22]:
# Reshape the wide users table into long (id, value, feature) form:
# one frame per categorical feature, stacked vertically at the end
user_features_frames = []
for feature in ["sex", "age", "income"]:
    feature_frame = (
        users.reindex(columns=[Columns.User, feature])
        .rename(columns={Columns.User: "id", feature: "value"})
        .assign(feature=feature)
    )
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex


In [23]:
user_features.query(f"id == 973171")

Unnamed: 0,id,value,feature
0,973171,М,sex
0,973171,age_25_34,age
0,973171,income_60_90,income


# Item features

In [24]:
items.isnull().sum()

item_id             0
content_type        0
title               0
title_orig       4745
release_year       98
genres              0
countries          37
for_kids        15397
age_rating          2
studios         14898
directors        1509
actors           2619
description         2
keywords          423
dtype: int64

In [25]:
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()

In [26]:
items.head()

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг..."
3,7868,film,45 лет,45 Years,2015.0,"драмы, зарубежные, мелодрамы",Великобритания,,16.0,,Эндрю Хэй,"Александра Риддлстон-Барретт, Джеральдин Джейм...","Шарлотта Рэмплинг, Том Кортни, Джеральдин Джей...","45, лет, 2015, Великобритания, брак, жизнь, лю..."
4,16268,film,Все решает мгновение,,1978.0,"драмы, спорт, советские, мелодрамы",СССР,,12.0,Ленфильм,Виктор Садовский,"Александр Абдулов, Александр Демьяненко, Алекс...",Расчетливая чаровница из советского кинохита «...,"Все, решает, мгновение, 1978, СССР, сильные, ж..."


In [27]:
items.nunique()

item_id         14019
content_type        2
title           13454
title_orig       9724
release_year      104
genres           2559
countries         666
for_kids            2
age_rating          6
studios            38
directors        7414
actors          11830
description     13791
keywords        13583
dtype: int64

### Genre

In [28]:
# Split the comma-separated genres string into a list per item (stored on `items`),
# then explode it so that every (item, genre) pair gets its own row
genre_lists = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
items["genre"] = genre_lists
genre_feature = (
    items[["item_id", "genre"]]
    .explode("genre")
    .rename(columns={"item_id": "id", "genre": "value"})
    .assign(feature="genre")
)
genre_feature.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


### Content

In [29]:
# Long-form (id, value, feature) frame holding each item's content_type
content_feature = (
    items.reindex(columns=[Columns.Item, "content_type"])
    .rename(columns={Columns.Item: "id", "content_type": "value"})
    .assign(feature="content_type")
)

In [30]:
content_feature

Unnamed: 0,id,value,feature
0,10711,film,content_type
1,2508,film,content_type
2,10716,film,content_type
3,7868,film,content_type
4,16268,film,content_type
...,...,...,...
15958,6443,series,content_type
15959,2367,series,content_type
15960,10632,series,content_type
15961,4538,series,content_type


In [31]:
item_features = pd.concat((genre_feature, content_feature))

In [32]:
item_features

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre
...,...,...,...
15958,6443,series,content_type
15959,2367,series,content_type
15960,10632,series,content_type
15961,4538,series,content_type


# Metrics

In [33]:
# Metric classes to evaluate, keyed by the prefix used in the "<name>@<k>" labels
metrics_name = {
    'Precision': Precision,
    'Recall': Recall,
    'MAP': MAP,
}

# Instantiate every metric class at each cutoff k = 1..10
# (keys run from 'Precision@1' through 'MAP@10')
metrics = {}
for metric_name, metric in metrics_name.items():
    for k in range(1, 11):
        metrics[f'{metric_name}@{k}'] = metric(k=k)

In [34]:
metrics

{'Precision@1': Precision(k=1),
 'Precision@2': Precision(k=2),
 'Precision@3': Precision(k=3),
 'Precision@4': Precision(k=4),
 'Precision@5': Precision(k=5),
 'Precision@6': Precision(k=6),
 'Precision@7': Precision(k=7),
 'Precision@8': Precision(k=8),
 'Precision@9': Precision(k=9),
 'Precision@10': Precision(k=10),
 'Recall@1': Recall(k=1),
 'Recall@2': Recall(k=2),
 'Recall@3': Recall(k=3),
 'Recall@4': Recall(k=4),
 'Recall@5': Recall(k=5),
 'Recall@6': Recall(k=6),
 'Recall@7': Recall(k=7),
 'Recall@8': Recall(k=8),
 'Recall@9': Recall(k=9),
 'Recall@10': Recall(k=10),
 'MAP@1': MAP(k=1, divide_by_k=False),
 'MAP@2': MAP(k=2, divide_by_k=False),
 'MAP@3': MAP(k=3, divide_by_k=False),
 'MAP@4': MAP(k=4, divide_by_k=False),
 'MAP@5': MAP(k=5, divide_by_k=False),
 'MAP@6': MAP(k=6, divide_by_k=False),
 'MAP@7': MAP(k=7, divide_by_k=False),
 'MAP@8': MAP(k=8, divide_by_k=False),
 'MAP@9': MAP(k=9, divide_by_k=False),
 'MAP@10': MAP(k=10, divide_by_k=False)}

# Model

In [35]:
# # The best model ALS

# K_RECOS = 10
# RANDOM_STATE = 42
# NUM_THREADS = 16
# n_factors = (32)
# iterations = (20)

# dataset = Dataset.construct(
#     interactions_df=train,
#     user_features_df=user_features,
#     cat_user_features=["sex", "age", "income"],
#     item_features_df=item_features,
#     cat_item_features=["genre", "content_type"],
# )

# model = ImplicitALSWrapperModel(
#                 model=AlternatingLeastSquares(
#                     factors=n_factors, 
#                     random_state=RANDOM_STATE, 
#                     num_threads=NUM_THREADS,
#                     iterations=iterations,
#                 ))

# model.fit(dataset)
# recos = model.recommend(
#     users=test[Columns.User].unique(),
#     dataset=dataset,
#     k=K_RECOS,
#     filter_viewed=True,
# )

In [38]:
# # The best model SVD

# K_RECOS = 10
# RANDOM_STATE = 42
# NUM_THREADS = 16
# N_FACTORS = (16,) 

# dataset = Dataset.construct(
#     interactions_df=train,
#     user_features_df=user_features,
#     cat_user_features=["sex", "age", "income"],
#     item_features_df=item_features,
#     cat_item_features=["genre", "content_type"],
# )

# models = {
#     'pure_svd': PureSVDModel(16,)
# }

# results = []
# for model_name, model in models.items():
#     print(f"Fitting model {model_name}...")
#     model_quality = {'model': model_name}

#     model.fit(dataset)
#     recos = model.recommend(
#         users=test[Columns.User].unique(),
#         dataset=dataset,
#         k=K_RECOS,
#         filter_viewed=True,
#     )
#     metric_values = calc_metrics(metrics, recos, test, train)
#     model_quality.update(metric_values)
#     results.append(model_quality)

Fitting model pure_svd...


In [37]:
# The best model LightFM
# NOTE: NUM_THREADS, N_FACTORS and ITERATIONS are defined here but are not referenced
# anywhere else in this cell.
K_RECOS = 10
RANDOM_STATE = 42
NUM_THREADS = 16
N_FACTORS = (32, 64, 128, 160)
ITERATIONS = (15, 20)

# Hyperparameters forwarded to the LightFM constructor below
best_components = 17
best_loss = 'logistic'
best_rho = 0.93
best_lr = 0.05

epsilon = 3

# The dataset is built from the train interactions only; no user/item features are passed
dataset = Dataset.construct(
        interactions_df=train
    )

# NOTE(review): LightFM documents learning_rate as the adagrad setting, while rho and epsilon
# belong to adadelta — confirm that learning_rate has any effect with this schedule.
lightfm = LightFMWrapperModel(
        model = LightFM(
            no_components = best_components,
            learning_schedule = 'adadelta',
            loss = best_loss,
            rho = best_rho,
            epsilon = epsilon,
            learning_rate = best_lr,
            random_state = RANDOM_STATE
        )
    )

lightfm.fit(dataset)

# K_RECOS recommendations for each distinct user left in test; filter_viewed=True is passed
# (presumably to exclude items the user already interacted with — see the RecTools docs)
recs = lightfm.recommend(
    users=test[Columns.User].unique(),
    dataset=dataset,
    k=K_RECOS,
    filter_viewed=True,
)

# Metric values are computed here but not displayed by this cell
metric_values = calc_metrics(metrics, recs, test, train)
# Alias used by the later cells that extract embeddings via model.get_vectors(dataset)
model = lightfm

# Approximate Nearest Neighbors 

In [38]:
import nmslib

Your CPU supports instructions that this binary was not compiled to use: SSE3 SSE4.1 SSE4.2 AVX AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib


# HNSW algorithm parameters

## Search parameters:
* ```ef``` - the size of the dynamic list for the nearest neighbors (used during the search). Higher ```ef```
leads to more accurate but slower search. ```ef``` cannot be set lower than the number of queried nearest neighbors
```k```. The value of ```ef``` can be anything between ```k``` and the size of the dataset.
* ```k``` number of nearest neighbors to be returned as the result.
The ```knn_query``` function returns two numpy arrays, containing labels and distances to the k found nearest 
elements for the queries. Note that if the algorithm is not able to find ```k``` neighbors for all of the queries
(this can be due to problems with the graph or ```k``` > size of the dataset), an exception is thrown.


## Construction parameters:
* ```M``` - the number of bi-directional links created for every new element during construction. Reasonable range for ```M``` 
is 2-100. Higher ```M``` work better on datasets with high intrinsic dimensionality and/or high recall, while low ```M``` work 
better for datasets with low intrinsic dimensionality and/or low recalls. The parameter also determines the algorithm's memory 
consumption, which is roughly ```M * 8-10``` bytes per stored element.  
As an example for ```dim```=4 random vectors optimal ```M``` for search is somewhere around 6, while for high dimensional datasets 
(word embeddings, good face descriptors), higher ```M``` are required (e.g. ```M```=48-64) for optimal performance at high recall. 
The range ```M```=12-48 is ok for most use cases. When ```M``` is changed, one has to update the other parameters. 
Nonetheless, the ```ef``` and ```ef_construction``` parameters can be roughly estimated by assuming that ```M```*```ef_construction``` is 
a constant.

* ```ef_construction``` - the parameter has the same meaning as ```ef```, but controls the index_time/index_accuracy trade-off. Bigger 
ef_construction leads to longer construction, but better index quality. At some point, increasing ef_construction does
not improve the quality of the index. One way to check if the selection of ef_construction was ok is to measure the recall 
for M nearest neighbor search when ```ef``` = ```ef_construction```: if the recall is lower than 0.9, then there is room 
for improvement.
* ```num_elements``` - defines the maximum number of elements in the index. The index can be extended by saving/loading (the load_index
function has a parameter which defines the new maximum number of elements).

Еще источники: 
- [Nmslib Docs](https://github.com/nmslib/nmslib/blob/master/manual/methods.md)
- [Pinecone Vector Indexes](https://www.pinecone.io/learn/vector-indexes/)

<img src="https://d33wubrfki0l68.cloudfront.net/4c635fabb268a4af60109a506300a2dfda612063/d2535/images/similarity-search-indexes17.jpg">

<img src="https://d33wubrfki0l68.cloudfront.net/96d80cd46c2d12df99c044c860a8a5fb00cf6376/d59ca/images/similarity-search-indexes18.jpg">

In [39]:
import time

In [44]:
user_embeddings, item_embeddings = model.get_vectors(dataset)

In [45]:
user_embeddings.shape, item_embeddings.shape

((756562, 19), (14019, 19))

In [46]:
def augment_inner_product(factors):
    """Append one column so that every row of `factors` has the same L2 norm.

    The extra component of each row is sqrt(max_norm**2 - row_norm**2), where
    max_norm is the largest row norm in `factors`; the row with the largest
    norm therefore gets 0 in the new column.

    Parameters
    ----------
    factors : 2-D array whose rows are the vectors to augment.

    Returns
    -------
    (max_norm, augmented_factors) : the largest row norm and a copy of
        `factors` with the extra column appended as the last dimension.
    """
    row_norms = np.linalg.norm(factors, axis=1)
    max_norm = row_norms.max()

    # Pad each row up to the common norm max_norm
    padding = np.sqrt(max_norm ** 2 - row_norms ** 2)
    augmented_factors = np.concatenate((factors, padding[:, np.newaxis]), axis=1)
    return max_norm, augmented_factors

In [47]:
print('pre shape: ', item_embeddings.shape)
max_norm, augmented_item_embeddings = augment_inner_product(item_embeddings)
augmented_item_embeddings.shape

pre shape:  (14019, 19)


(14019, 20)

In [48]:
# Users get a zero in the extra dimension, so the dot product with an augmented
# item vector equals the dot product of the original vectors
extra_zero = np.zeros((len(user_embeddings), 1))
augmented_user_embeddings = np.concatenate((user_embeddings, extra_zero), axis=1)
augmented_user_embeddings.shape

(756562, 20)

In [49]:
user_id = 30

In [50]:
user_embeddings[user_id]

array([ 3.34024429e-04,  1.00000000e+00,  1.72437131e-02, -3.63814714e-03,
        2.34629884e-02,  1.16966479e-02, -1.20902592e-02, -2.54115220e-02,
       -1.95118245e-02, -2.26247404e-02,  2.43350677e-02, -9.15376283e-03,
       -9.57392901e-03, -9.82469693e-03, -1.68472081e-02,  2.71503422e-02,
       -4.57300618e-03,  2.43195221e-02,  6.20624702e-03])

In [51]:
augmented_user_embeddings[user_id]

array([ 3.34024429e-04,  1.00000000e+00,  1.72437131e-02, -3.63814714e-03,
        2.34629884e-02,  1.16966479e-02, -1.20902592e-02, -2.54115220e-02,
       -1.95118245e-02, -2.26247404e-02,  2.43350677e-02, -9.15376283e-03,
       -9.57392901e-03, -9.82469693e-03, -1.68472081e-02,  2.71503422e-02,
       -4.57300618e-03,  2.43195221e-02,  6.20624702e-03,  0.00000000e+00])

In [52]:
item_id = 0

In [53]:
item_embeddings[item_id]

array([ 1.00000000e+00,  3.15215921e+00,  1.13837095e-03,  1.29325734e-02,
       -1.63832319e-03,  1.71538424e-02, -2.94670314e-02,  6.39237044e-03,
       -2.77512819e-02,  2.66506206e-02, -4.75359568e-03,  6.88226447e-02,
       -1.97166558e-02,  1.80595256e-02,  3.84662161e-03, -4.04029712e-02,
       -2.10334267e-02, -1.67550743e-02, -2.30941288e-02])

In [54]:
augmented_item_embeddings[item_id]

array([ 1.00000000e+00,  3.15215921e+00,  1.13837095e-03,  1.29325734e-02,
       -1.63832319e-03,  1.71538424e-02, -2.94670314e-02,  6.39237044e-03,
       -2.77512819e-02,  2.66506206e-02, -4.75359568e-03,  6.88226447e-02,
       -1.97166558e-02,  1.80595256e-02,  3.84662161e-03, -4.04029712e-02,
       -2.10334267e-02, -1.67550743e-02, -2.30941288e-02,  1.26227529e+01])

In [55]:
# Set index parameters
# These are the most important ones
# (M and efC are the HNSW construction parameters M and efConstruction described in the markdown above)
M = 64
efC = 128

# Thread count shared by index construction and batched querying
num_threads = 4
# NOTE: this dict is only printed here; the cell that calls createIndex builds its own
# index_time_params dict (without the 'post' key).
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}
print('Index-time parameters', index_time_params)

Index-time parameters {'M': 64, 'indexThreadQty': 4, 'efConstruction': 128, 'post': 0}


In [56]:
# Number of neighbors 
K=10

In [57]:
# Space name should correspond to the space name 
# used for brute-force search
space_name='negdotprod'

In [58]:
# Intitialize the library, specify the space, the type of the vector and add data points 
index = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR) 
index.addDataPointBatch(augmented_item_embeddings) 

14019

In [59]:
index

<nmslib.FloatIndex method='hnsw' space='negdotprod' at 0x83b4920>

In [60]:
# Create an index and report how long construction took.
# time.perf_counter() is used for the measurement: it is a monotonic, high-resolution
# clock, so the result cannot be distorted by system clock adjustments like time.time().
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC}
start = time.perf_counter()
index.createIndex(index_time_params)
end = time.perf_counter()
print('Index-time parameters', index_time_params)
print('Indexing time = %f' % (end-start))

Index-time parameters {'M': 64, 'indexThreadQty': 4, 'efConstruction': 128}
Indexing time = 0.214294


In [61]:
# Setting query-time parameters
efS = 128
query_time_params = {'efSearch': efS}
print('Setting query-time parameters', query_time_params)
index.setQueryTimeParams(query_time_params)

Setting query-time parameters {'efSearch': 128}


In [62]:
query_matrix = augmented_user_embeddings

In [63]:
# Querying: one batched kNN lookup for every row of query_matrix, timed end to end.
# time.perf_counter() is used for the measurement: it is a monotonic, high-resolution
# clock, so the result cannot be distorted by system clock adjustments like time.time().
query_qty = query_matrix.shape[0]
start = time.perf_counter()
nbrs = index.knnQueryBatch(query_matrix, k = K, num_threads = num_threads)
end = time.perf_counter()
print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' % 
      (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty)) 

kNN time total=2.474607 (sec), per query=0.000003 (sec), per query adjusted for thread number=0.000013 (sec)


In [64]:
nbrs[0]

(array([12982,  8867, 11758,  8287,  3527,  3174,  2256,   118,  5797,
         7342], dtype=int32),
 array([-13.007258, -12.891575, -12.498563, -12.431263, -12.182217,
        -12.041499, -11.803938, -11.451096, -11.33427 , -11.16776 ],
       dtype=float32))

In [65]:
nbrs[0][1]

array([-13.007258, -12.891575, -12.498563, -12.431263, -12.182217,
       -12.041499, -11.803938, -11.451096, -11.33427 , -11.16776 ],
      dtype=float32)

In [66]:
def recommend_all(query_factors, index_factors, topn=10):
    """Exact (brute-force) top-n search by inner product.

    Parameters
    ----------
    query_factors : array of shape (n_queries, dim)
    index_factors : array of shape (n_items, dim)
    topn : number of items to return per query. Values larger than n_items
        are clamped to n_items instead of raising.

    Returns
    -------
    (labels, distances) : two arrays of shape (n_queries, min(topn, n_items)).
        `labels` holds item row indices ordered by decreasing inner product,
        `distances` holds the matching inner products (scores, not metric distances).

    Raises
    ------
    ValueError : if topn is smaller than 1.
    """
    if topn < 1:
        raise ValueError("topn must be a positive integer")

    scores = query_factors.dot(index_factors.T)

    # Never ask for more neighbours than there are items
    topn = min(topn, scores.shape[1])
    if topn == 0:
        return np.empty((scores.shape[0], 0), dtype=np.intp), scores[:, :0]

    # argpartition isolates the topn best items per row without sorting the full row ...
    candidates = np.argpartition(scores, -topn, axis=1)[:, -topn:]
    candidate_scores = np.take_along_axis(scores, candidates, axis=1)

    # ... and only those topn candidates are then ordered by decreasing score
    order = np.argsort(candidate_scores, axis=1)[:, ::-1]
    labels = np.take_along_axis(candidates, order, axis=1)
    distances = np.take_along_axis(scores, labels, axis=1)
    return labels, distances

In [67]:
recommend_all(user_embeddings[[0], :], item_embeddings)

(array([[12982,  8867, 11758,  8287,  3527,  3174,  2256,   118,  5797,
          7342]]),
 array([[13.00725893, 12.89157521, 12.49856294, 12.4312617 , 12.18221654,
         12.04149928, 11.80393829, 11.45109424, 11.33427053, 11.16776066]]))

In [68]:
item_embeddings[:1000, :].shape, user_embeddings.shape

((1000, 19), (756562, 19))

In [69]:
query_matrix_not_augmented = user_embeddings[:1000, :]

In [70]:
%%timeit
recommend_all(query_matrix_not_augmented, item_embeddings)

166 ms ± 7.67 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [71]:
%%timeit
index.knnQueryBatch(query_matrix, k = K, num_threads = num_threads)

2.66 s ± 18.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [72]:
import hnswlib

In [73]:
%%time
max_elements, dim = augmented_item_embeddings.shape
hnsw = hnswlib.Index("ip", dim) # possible options for space are l2, cosine or ip

# Initing index - the maximum number of elements should be known beforehand
hnsw.init_index(max_elements, M, efC)

# Element insertion (can be called several times)
hnsw.add_items(augmented_item_embeddings)

CPU times: user 1.1 s, sys: 12.3 ms, total: 1.11 s
Wall time: 126 ms


In [74]:
# Controlling the recall by setting ef, should always be > k
hnsw.set_ef(efS)

In [75]:
# Query the hnswlib index for the K nearest items of every user.
# The explicit K constant is used: lowercase `k` only exists as a leftover loop variable
# from the metrics cell, so depending on it is hidden kernel state.
label, distance = hnsw.knn_query(query_matrix, k=K)

In [76]:
label

array([[12982,  8867, 11758, ...,   118,  5797,  7342],
       [12982,  8867, 11758, ...,   118,  5797,  7342],
       [12982,  8867, 11758, ...,   118,  5797,  7342],
       ...,
       [12982,  8867, 11758, ...,   118,  5797,  7342],
       [12982,  8867, 11758, ...,   118,  5797,  7342],
       [12982,  8867, 11758, ...,   118,  5797,  7342]], dtype=uint64)

In [77]:
1 - distance

array([[13.007259 , 12.891575 , 12.498563 , ..., 11.451096 , 11.3342705,
        11.16776  ],
       [13.006828 , 12.893085 , 12.495227 , ..., 11.453471 , 11.335853 ,
        11.168996 ],
       [13.375364 , 13.257078 , 12.868646 , ..., 11.819419 , 11.701473 ,
        11.5361805],
       ...,
       [13.091685 , 12.970072 , 12.577861 , ..., 11.533123 , 11.417131 ,
        11.2504835],
       [13.010241 , 12.898027 , 12.496975 , ..., 11.45405  , 11.337967 ,
        11.170005 ],
       [13.007491 , 12.888544 , 12.498455 , ..., 11.454079 , 11.336102 ,
        11.16612  ]], dtype=float32)

In [78]:
item_embeddings[8867].dot(user_embeddings[0])

12.891575207435197

In [79]:
labels, distances = recommend_all(user_embeddings[:1000, :], item_embeddings)
print(labels)
print(distances)

[[12982  8867 11758 ...   118  5797  7342]
 [12982  8867 11758 ...   118  5797  7342]
 [12982  8867 11758 ...   118  5797  7342]
 ...
 [12982  8867 11758 ...   118  5797  7342]
 [12982  8867 11758 ...   118  5797  7342]
 [12982  8867 11758 ...   118  5797  7342]]
[[13.00725893 12.89157521 12.49856294 ... 11.45109424 11.33427053
  11.16776066]
 [13.00682883 12.89308358 12.49522747 ... 11.4534712  11.33585224
  11.16899535]
 [13.3753637  13.25707789 12.86864591 ... 11.81941864 11.70147368
  11.53618111]
 ...
 [15.44406502 15.32638096 14.92854583 ... 13.89176365 13.77554153
  13.60952722]
 [13.02805209 12.90644352 12.51850007 ... 11.46619781 11.3524233
  11.18402185]
 [13.01530207 12.89657384 12.50356231 ... 11.45609951 11.34150721
  11.17275412]]


In [80]:
# Recommendations for a user
user_id = 0
label[user_id]

array([12982,  8867, 11758,  8287,  3527,  3174,  2256,   118,  5797,
        7342], dtype=uint64)

In [81]:
# Save item_embeddings and user_embeddings
import joblib

# joblib.dump does not create missing parent directories, so make sure the target exists
Path('data/offline_reco_df').mkdir(parents=True, exist_ok=True)

joblib.dump(item_embeddings, 'data/offline_reco_df/ANN_item_embeddings.sav')
joblib.dump(user_embeddings, 'data/offline_reco_df/ANN_user_embeddings.sav')

['data/offline_reco_df/ANN_user_embeddings.sav']

In [82]:
# Persist the hnswlib query results (item labels and distances for every user).
# joblib.dump does not create missing parent directories, so make sure the target exists.
Path('data/offline_reco_df').mkdir(parents=True, exist_ok=True)

joblib.dump(label, 'data/offline_reco_df/ANN_label.sav')
joblib.dump(distance, 'data/offline_reco_df/ANN_distance.sav')

['data/offline_reco_df/ANN_distance.sav']