In [1]:
import time

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_distances
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors

### read data

In [2]:
data_df = pd.read_csv('data/Reviews.csv')

In [3]:
data_df = data_df.groupby(['UserId', 'ProductId']).first()

In [4]:
scores_df = data_df['Score']

In [5]:
scores_df = scores_df.reset_index()

In [6]:
users = scores_df['UserId']
items = scores_df['ProductId']

In [7]:
users = users.astype('category')
items = items.astype('category')

In [8]:
scores = scores_df['Score']
scores = scores.astype('int32')

### create user / item sparse matrix

In [9]:
data = scores.to_numpy()
users_idx = users.cat.codes.to_numpy()
items_idx = items.cat.codes.to_numpy()

In [10]:
data.shape, users_idx.shape, items_idx.shape

((560804,), (560804,), (560804,))

In [11]:
users_num = users.cat.categories.shape[0]
items_num = items.cat.categories.shape[0]

In [12]:
users_num, items_num

(256059, 74258)

In [13]:
sparse_data = csr_matrix((data, (users_idx, items_idx)), shape=(users_num, items_num))

In [14]:
sparse_data

<256059x74258 sparse matrix of type '<class 'numpy.int32'>'
	with 560804 stored elements in Compressed Sparse Row format>

### apply nearest neighbor search using cosine distance

In [15]:
seed = 1

In [16]:
train, test = train_test_split(sparse_data, test_size=0.1, random_state=seed, shuffle=True)

In [17]:
neigh = NearestNeighbors(n_neighbors=3, radius=1, metric=cosine_distances, n_jobs=-1)

In [18]:
neigh.fit(train)

NearestNeighbors(metric=<function cosine_distances at 0x7f049c6a9840>,
                 n_jobs=-1, n_neighbors=3, radius=1)

In [19]:
cosine_distances(train[0], train[1])

array([[1.]])

In [20]:
start = time.time()

sample_idx = np.random.randint(0, train.shape[0])
sample = train[sample_idx]
neigh.kneighbors(sample, n_neighbors=3)

print('searching for one sample took {:.3f} seconds'.format(time.time() - start))

searching for one sample took 212.204 seconds
