In [1]:
import numpy as np
import pandas as pd

In [9]:
column_names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv("E:\\Sistemas_de_Recomendação\\movies\\u.data", sep='\t', names=column_names)

In [10]:
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,50,5,881250949
1,0,172,5,881250949
2,0,133,1,881250949
3,196,242,3,881250949
4,186,302,3,891717742


In [13]:
movie_titles = pd.read_csv("E:\\Sistemas_de_Recomendação\\movies\\Movie_Id_Titles")
movie_titles.head()

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [14]:
df = pd.merge(df,movie_titles,on='item_id')
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,0,50,5,881250949,Star Wars (1977)
1,290,50,5,880473582,Star Wars (1977)
2,79,50,4,891271545,Star Wars (1977)
3,2,50,5,888552084,Star Wars (1977)
4,8,50,5,879362124,Star Wars (1977)


In [15]:
n_users = df.user_id.nunique()
n_items = df.item_id.nunique()

print('Num. of Users: '+ str(n_users))
print('Num of Movies: '+str(n_items))

Num. of Users: 944
Num of Movies: 1682


In [16]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(df, test_size=0.25)

In [17]:
# Crie duas matrizes de item de usuário, uma para treinamento e outra para teste
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]  

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

In [18]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [19]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    return pred

In [20]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

In [27]:
print('User-based: ' + str(user_prediction))
print('Item-based: ' + str(item_prediction))

User-based: [[ 1.63575754  0.60759848  0.50243992 ...  0.30699448  0.30940087
   0.30926665]
 [ 1.32216722  0.30116627  0.15131017 ... -0.07330853 -0.0702123
  -0.06986846]
 [ 1.35206051  0.26406747  0.1238314  ... -0.1077617  -0.10455372
  -0.10431668]
 ...
 [ 1.39049687  0.33080638  0.21758548 ... -0.0076045  -0.00468017
  -0.00449169]
 [ 1.44474337  0.40153581  0.30579463 ...  0.10846707  0.110937
   0.11091539]
 [ 1.23222386  0.19045537  0.06713313 ... -0.15832807 -0.15537104
  -0.15534603]]
Item-based: [[0.38192631 0.39347228 0.40598547 ... 0.45746579 0.44931609 0.44738235]
 [0.07876842 0.0930921  0.08869293 ... 0.09518144 0.09437616 0.09563853]
 [0.05840639 0.0616794  0.06005346 ... 0.06186794 0.06173898 0.06261576]
 ...
 [0.12834867 0.13462025 0.14566818 ... 0.15288519 0.15123007 0.15253254]
 [0.21047078 0.20323612 0.22586656 ... 0.26115407 0.25421023 0.25450493]
 [0.00392803 0.0047738  0.00562334 ... 0.00654372 0.00624896 0.00627307]]


In [21]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [22]:
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE: 3.1298020570955005
Item-based CF RMSE: 3.455706786041221


In [23]:
sparsity=round(1.0-len(df)/float(n_users*n_items),3)
print('The sparsity level of MovieLens100K is ' +  str(sparsity*100) + '%')

The sparsity level of MovieLens100K is 93.7%


In [24]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

#get SVD components from train matrix. Choose k.
u, s, vt = svds(train_data_matrix, k = 20)
s_diag_matrix=np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print('User-based CF MSE: ' + str(rmse(X_pred, test_data_matrix)))

User-based CF MSE: 2.7200190281303795
