In [1]:
import time
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF

In [2]:
%%time
# Load the MovieLens 100k ratings file: tab-separated, column names supplied here.
ratings_url = 'http://files.grouplens.org/datasets/movielens/ml-100k/u.data'
column_names = ["user_id", "item_id", "rating", "timestamp"]
dataset = pd.read_csv(ratings_url, sep="\t", names=column_names)

# Sorted arrays of the distinct user ids and item ids present in the ratings.
uq_users = np.sort(dataset["user_id"].unique().tolist())
uq_items = np.sort(dataset["item_id"].unique().tolist())
n_users, n_items = len(uq_users), len(uq_items)

CPU times: user 49.1 ms, sys: 15.8 ms, total: 64.9 ms
Wall time: 1.99 s


In [3]:
# Number of recommendations produced per user.
topk = 10
# 1-based rank labels (1..topk) attached to each user's recommendations.
rank_list = list(range(1, topk + 1))
# Number of latent components used for the NMF factorization.
latent = 50

In [4]:
%%time
# Sparse user x item rating matrix. The raw user_id / item_id values are used
# directly as row / column indices, so the matrix is one larger than the largest
# id in each dimension and index 0 is presumably unused (ids appear to start
# at 1 -- TODO confirm against the data).
matrix_data = csr_matrix((dataset.rating, (dataset.user_id, dataset.item_id)))

# Factorize ratings ~= W @ H with `latent` non-negative components.
# random_state is pinned so that repeated "Restart & Run All" executions yield
# the same factors (and therefore the same recommendations).
nmf = NMF(n_components=latent, random_state=0)
W = nmf.fit_transform(matrix_data)  # one row of latent factors per user index
H = nmf.components_                 # one column of latent factors per item index



CPU times: user 6.41 s, sys: 5.01 s, total: 11.4 s
Wall time: 9.44 s




In [5]:
# Show the shapes of the two NMF factors, one per line.
print(W.shape, H.shape, sep="\n")

(944, 50)
(50, 1683)


In [6]:
# Dense reconstruction of the rating matrix from the two NMF factors.
WH = W @ H

In [7]:
# Preview rows 1-4 and columns 1-4 of the reconstructed matrix WH.
WH[1:5, 1:5]

array([[5.11250520e+00, 1.87847686e+00, 1.33275442e+00, 2.86954925e+00],
       [2.06567769e+00, 9.02973695e-03, 1.70733394e-01, 6.34839375e-02],
       [1.77091129e-01, 1.59286643e-02, 1.10084380e-04, 4.73242662e-02],
       [2.63734308e-01, 0.00000000e+00, 3.33068008e-02, 2.04001394e-01]])

In [8]:
# Candidate scores: keep the reconstructed value where the stored rating is 0
# (no rating) and overwrite it with 0 wherever a non-zero rating exists.
# NOTE(review): WH itself can contain exact zeros, so a masked (already rated)
# entry can tie with an unrated one -- confirm that this is acceptable.
already_rated = matrix_data.toarray() != 0
recommend_matrix = np.where(already_rated, 0, WH)

In [9]:
# Preview rows 1-4 and columns 1-4 of the stored ratings; only the sliced
# window is converted to a dense array.
matrix_data[1:5, 1:5].toarray()

array([[5, 3, 4, 3],
       [4, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0]], dtype=int64)

In [10]:
# Preview rows 1-4 and columns 1-4 of recommend_matrix, for comparison with the
# same window of WH and of the rating matrix.
recommend_matrix[1:5, 1:5]

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 9.02973695e-03, 1.70733394e-01, 6.34839375e-02],
       [1.77091129e-01, 1.59286643e-02, 1.10084380e-04, 4.73242662e-02],
       [2.63734308e-01, 0.00000000e+00, 3.33068008e-02, 2.04001394e-01]])

In [11]:
%%time
# Build the top-k recommendation table: one row per (user, recommended item)
# with columns user_id, item_id, score and rank.
#
# The per-user frames are collected in a list and concatenated once at the end.
# DataFrame.append was removed in pandas 2.0 and copied the whole accumulated
# frame on every iteration.
per_user_frames = []
for user_id in uq_users:
    # Row `user_id` of recommend_matrix holds this user's score for every item column.
    item_scores = recommend_matrix[user_id]
    # Sort once (descending) and reuse the order for both the item ids and
    # their scores; item_id is the column index into recommend_matrix.
    top_items = np.argsort(item_scores)[::-1][:topk]
    n_found = len(top_items)  # can be smaller than topk if there are few columns
    per_user_frames.append(pd.DataFrame({
        'user_id': np.full(n_found, user_id),
        'item_id': top_items,
        'score': item_scores[top_items],
        'rank': rank_list[:n_found],
    }))

if per_user_frames:
    df_recommend_list = pd.concat(per_user_frames, ignore_index=True)
else:
    # No users at all: keep the expected (empty) schema.
    df_recommend_list = pd.DataFrame(columns=['user_id', 'item_id', 'score', 'rank'])

CPU times: user 6.75 s, sys: 1.7 ms, total: 6.75 s
Wall time: 13.6 s


In [12]:
# Display the assembled recommendation table (the rendered output is truncated
# by pandas for long frames).
df_recommend_list

Unnamed: 0,user_id,item_id,score,rank
0,1,285,4.171114,1
1,1,423,4.107745,2
2,1,655,3.692464,3
3,1,408,3.579033,4
4,1,276,3.390573,5
...,...,...,...,...
9425,943,651,2.826533,6
9426,943,82,2.709949,7
9427,943,33,2.601421,8
9428,943,684,2.551532,9
