In [18]:
import numpy as np
import pandas as pd
import os
from scipy.sparse.linalg import svds
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz
from scipy.sparse import csr_matrix

In [10]:
# Load the three source tables in one pass; the path order matches the
# order of the names on the left-hand side.
# NOTE(review): the ratings preview shows an "Unnamed: 0" column, which looks
# like a saved index read back as data -- confirm whether it can be dropped.
ratings_csv, books_csv, users_csv = (
    pd.read_csv(csv_path)
    for csv_path in (
        "dataset/ratings_drop.csv",
        "dataset/books_drop.csv",
        "dataset/users.csv",
    )
)
ratings_csv.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,260,5
2,2,26,4
3,2,315,3
4,2,33,4


In [11]:
# Reshape the long ratings table into a dense user x book matrix.
# (user, book) pairs with no rating are filled with 0.
Ratings = (
    ratings_csv
    .pivot(index='user_id', columns='book_id', values='rating')
    .fillna(0)
)
print(Ratings.shape)
Ratings.head()

(53404, 1000)


book_id,1,2,3,4,5,6,7,8,9,10,...,1423,1434,1435,1465,1498,1526,1530,1566,1681,1739
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,5.0,0.0,0.0,5.0,0.0,0.0,4.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,5.0,0.0,4.0,4.0,0.0,4.0,4.0,0.0,5.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Normalization: centre every user's row on that user's mean rating.
# The mean is taken across all columns of the filled matrix, so the 0
# placeholders for unrated books contribute to it.
R = Ratings.values
user_ratings_mean = R.mean(axis=1)
# Turn the 1-D means into a column so they broadcast across each row.
Ratings_demeaned = R - user_ratings_mean[:, np.newaxis]
print(Ratings_demeaned)

[[-0.315 -0.315 -0.315 ... -0.315 -0.315 -0.315]
 [-0.173  4.827 -0.173 ... -0.173 -0.173 -0.173]
 [-0.08  -0.08  -0.08  ... -0.08  -0.08  -0.08 ]
 ...
 [ 3.635  4.635 -0.365 ... -0.365 -0.365 -0.365]
 [ 3.79   4.79  -0.21  ... -0.21  -0.21  -0.21 ]
 [ 3.593  4.593  3.593 ... -0.407 -0.407 -0.407]]


In [15]:
# Sparsity = share of (user, book) cells that carry no rating.
# `n_movies` actually counts distinct books; the name is kept as-is because
# other cells may reference it.
n_users = len(ratings_csv.user_id.unique())
n_movies = len(ratings_csv.book_id.unique())
sparsity = round(1.0 - len(ratings_csv) / (n_users * n_movies), 3)
# Format with a percent spec instead of str(sparsity * 100): the manual
# multiplication leaks floating-point noise for many values
# (e.g. 0.57 * 100 prints 56.99999999999999).
print(f'Sparsity: {sparsity:.1%}')

Sparsity: 94.0%


In [36]:
# Truncated SVD of the demeaned matrix, keeping 50 singular triplets.
# At this point `sigma` is the 1-D vector of singular values.
U, sigma, Vt = svds(Ratings_demeaned, k=50)
# One shape per line: left factors, singular values, right factors.
print(U.shape, sigma.shape, Vt.shape, sep='\n')

(53404, 50)
(50,)
(50, 1000)


In [37]:
# Expand the 1-D vector of singular values into a square diagonal matrix so
# it can be used directly in the matrix product that rebuilds the ratings.
# np.diag on a 2-D array extracts the diagonal back into a vector, so an
# unguarded call would undo itself if this cell were executed twice; the
# ndim check makes the cell safe to re-run.
if sigma.ndim == 1:
    sigma = np.diag(sigma)
print(sigma.shape)

(50, 50)


In [38]:
# Persist the factorisation and the per-user means for later reuse.
# np.save does not create missing directories, so make sure the target
# folder exists first (no-op when it is already there).
os.makedirs('svd', exist_ok=True)
np.save('svd/U_svd', U)
np.save('svd/sigma_svd', sigma)
np.save('svd/Vt_svd', Vt)
np.save('svd/user_rating_mean', user_ratings_mean)

In [39]:
# Rebuild the low-rank approximation of the demeaned matrix, then add each
# user's mean back (as a column, so it broadcasts across that user's row).
low_rank_demeaned = U @ sigma @ Vt
all_user_predicted_ratings = low_rank_demeaned + user_ratings_mean[:, np.newaxis]

In [40]:
# Label the predicted-ratings array with the same user_id rows and book_id
# columns as the original ratings matrix.
preds = pd.DataFrame(
    data=all_user_predicted_ratings,
    index=Ratings.index,
    columns=Ratings.columns,
)
preds.head()

book_id,1,2,3,4,5,6,7,8,9,10,...,1423,1434,1435,1465,1498,1526,1530,1566,1681,1739
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.597243,-0.185541,0.334328,4.558122,1.339608,0.13317,-0.580192,0.813564,-0.040022,3.965342,...,0.074462,-0.031499,0.380873,0.310073,0.049295,-0.017033,0.023827,0.067558,0.216911,0.181918
2,-0.057847,3.587475,0.112793,0.867379,3.423247,0.473874,0.263777,2.674142,1.097552,2.809703,...,-0.051899,-0.060134,0.093488,0.125354,-0.020249,0.025562,-0.150718,0.007828,0.037436,0.092648
3,-0.041335,0.416593,-0.085658,2.537782,0.283053,-0.320999,0.008596,0.334389,-0.18586,0.252022,...,0.011061,0.034479,0.1043,0.162143,0.028918,0.119699,0.074165,0.115358,0.070155,0.084736
4,-0.235992,4.166215,-0.221033,4.157416,3.341856,0.045272,2.973134,4.411047,1.999579,3.357352,...,0.026815,0.117249,0.034523,0.426647,0.053862,0.261825,0.53053,0.234501,0.064621,0.288053
5,-0.12,0.290269,-0.029259,-0.151328,0.210757,2.200265,0.174477,0.103141,0.070978,0.272252,...,0.244988,0.056747,0.106963,0.042031,0.221937,0.03093,0.072577,0.079029,0.049231,0.060112
