In [1]:
import time

import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors

### read data

In [2]:
data_df = pd.read_csv('data/Reviews.csv')

In [3]:
data_df = data_df.groupby(['UserId', 'ProductId']).first()

In [4]:
scores_df = data_df['Score']

In [5]:
scores_df = scores_df.reset_index()

In [6]:
users = scores_df['UserId']
items = scores_df['ProductId']

In [7]:
users = users.astype('category')
items = items.astype('category')

In [8]:
scores = scores_df['Score']
scores = scores.astype('int32')

### create user / item sparse matrix

In [9]:
data = scores.to_numpy()
users_idx = users.cat.codes.to_numpy()
items_idx = items.cat.codes.to_numpy()

In [10]:
data.shape, users_idx.shape, items_idx.shape

((560804,), (560804,), (560804,))

In [11]:
users_num = users.cat.categories.shape[0]
items_num = items.cat.categories.shape[0]

In [12]:
users_num, items_num

(256059, 74258)

In [13]:
sparse_data = csr_matrix((data, (users_idx, items_idx)),
                         shape=(users_num, items_num))

In [14]:
sparse_data

<256059x74258 sparse matrix of type '<class 'numpy.int32'>'
	with 560804 stored elements in Compressed Sparse Row format>

### apply nearest neighbor search using cosine distance

In [15]:
seed = 1

In [16]:
train, test = train_test_split(sparse_data, test_size=0.1,
                               random_state=seed, shuffle=True)

### reduce dimensions with SVD

In [17]:
svd = TruncatedSVD(n_components=10, random_state=seed)

In [18]:
train_reduced = svd.fit_transform(train)

In [19]:
train.shape

(230453, 74258)

In [20]:
train_reduced.shape

(230453, 10)

In [21]:
neigh = NearestNeighbors(algorithm='ball_tree', metric=cosine, n_jobs=-1)

In [22]:
neigh.fit(train_reduced)

NearestNeighbors(algorithm='ball_tree',
                 metric=<function cosine at 0x7f6d0b7fc598>, n_jobs=-1)

In [23]:
def get_scores(query, nn_object=neigh, n=10, sparse=train):
    """
    Retrieve n nearest neighbors from sparse matrix and calculate
    scores from averaged nonzero elements
    :param query: rows from SVD-reduced array
    :param nn_object: NearestNeighbor search obj precomputed on
    SVD-reduced array
    :param n: number of neighbors to retrieve
    :param sparse: sparse matrix containing user scores
    :return: averaged scores
    """
    distance, indices = nn_object.kneighbors(query, n_neighbors=n)
    distance = distance[:, :, None]
    similarity = 1 - distance
    neighbors = []
    for row in indices:
        row_data = sparse[row].toarray()
        neighbors.append(row_data)
    neighbors = np.stack(neighbors, axis=0)  # shape: [samples, users, items]
    result = (neighbors * similarity).sum(axis=1)
    been_scored = np.where(neighbors != 0, 1, 0)
    norm = (been_scored * similarity).sum(axis=1)
    norm[norm == 0] = 1
    result = result / norm
    result = np.clip(result, 0, 1)  # account for float errors
    return result

In [51]:
def rmse(predicted, sparse=test):
    dense = sparse.toarray()
    se = np.power(predicted - dense, 2).sum()
    nonzero_num = (dense != 0).astype('int').sum()
    return np.power(se / nonzero_num, 0.5)

In [25]:
test_reduced = svd.transform(test)

In [31]:
test_reduced.shape

(25606, 10)

In [None]:
test_pred = get_scores(test_reduced[:100])

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [None]:
error = rmse(test_pred, test[:100])

### processing takes too long to calculate mse on test dataset
### skipping for now

In [None]:
def suggest_items(user_id,
                  n_items=3,
                  user_list=users.cat.categories,
                  item_list=items.cat.categories,
                  nn_object=neigh,
                  reduced=test_reduced,
                  sparse=test):