In [57]:
import time

import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
# from sklearn.metrics.pairwise import cosine_distances
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors

### read data

In [58]:
data_df = pd.read_csv('data/Reviews.csv')

In [59]:
data_df = data_df.groupby(['UserId', 'ProductId']).first()

In [60]:
scores_df = data_df['Score']

In [61]:
scores_df = scores_df.reset_index()

In [62]:
users = scores_df['UserId']
items = scores_df['ProductId']

In [63]:
users = users.astype('category')
items = items.astype('category')

In [64]:
scores = scores_df['Score']
scores = scores.astype('int32')

### create user / item sparse matrix

In [65]:
data = scores.to_numpy()
users_idx = users.cat.codes.to_numpy()
items_idx = items.cat.codes.to_numpy()

In [66]:
data.shape, users_idx.shape, items_idx.shape

((560804,), (560804,), (560804,))

In [67]:
users_num = users.cat.categories.shape[0]
items_num = items.cat.categories.shape[0]

In [68]:
users_num, items_num

(256059, 74258)

In [69]:
sparse_data = csr_matrix((data, (users_idx, items_idx)),
                         shape=(users_num, items_num))

In [70]:
sparse_data

<256059x74258 sparse matrix of type '<class 'numpy.int32'>'
	with 560804 stored elements in Compressed Sparse Row format>

### apply nearest neighbor search using cosine distance

In [71]:
seed = 1

In [72]:
train, test = train_test_split(sparse_data, test_size=0.1,
                               random_state=seed, shuffle=True)

### reduce dimensions with SVD

In [73]:
svd = TruncatedSVD(n_components=10, random_state=seed)

In [74]:
train_reduced = svd.fit_transform(train)

In [75]:
train.shape

(230453, 74258)

In [76]:
train_reduced.shape

(230453, 10)

In [77]:
neigh = NearestNeighbors(algorithm='ball_tree', metric=cosine, n_jobs=-1)

In [78]:
neigh.fit(train_reduced)

NearestNeighbors(algorithm='ball_tree',
                 metric=<function cosine at 0x7f4872c9a598>, n_jobs=-1)

In [79]:
test_idx = 0
test_sample = train_reduced[test_idx].reshape(1, -1)
dist, idx = neigh.kneighbors(test_sample, n_neighbors=10)

In [82]:
def get_scores(query, n=100, sparse=train):
    distance, indices = neigh.kneighbors(query, n_neighbors=n)
    distance = distance[:, :, None]
    similarity = 1 - distance
    neighbors = []
    for row in indices:
        row_data = sparse[row].toarray()
        neighbors.append(row_data)
    neighbors = np.stack(neighbors, axis=0)
    result = (neighbors * similarity).sum(axis=1)
    been_scored = np.where(neighbors != 0, 1, 0)
    norm = (been_scored * similarity).sum(axis=1)
    norm[norm == 0] = 1
    result = result / norm
    return result

test_scores = get_scores(train_reduced[[0, 1]])

In [83]:
test_scores

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [84]:
test_scores.min(), test_scores.max()

(0.0, 5.000000000000001)

In [85]:
test_scores.shape

(2, 74258)