In [1]:
import pandas as pd
import numpy as np


In [2]:
ratings_sml = pd.read_csv('Data/ratings_sml.csv')

In [3]:
print(ratings_sml.shape)
ratings_sml.head()

(2892878, 5)


Unnamed: 0,userId,num_user_rated,movieId,Title,rating
0,33455,150,110,Braveheart,5.0
1,33455,150,2959,Fight Club,4.5
2,33455,150,4226,Memento,4.5
3,33455,150,4878,Donnie Darko,4.5
4,33455,150,5577,Igby Goes Down,4.0


In [4]:
n_users = ratings_sml.userId.unique().shape[0]
n_movies = ratings_sml.movieId.unique().shape[0]
print (f"Number of users = {n_users} Number of movies = {n_movies}")

Number of users = 27585 Number of movies = 11102


In [5]:
# pivot ratings into movie features
ratings_matrix = ratings_sml.pivot(index = 'userId', 
                                   columns = 'movieId', values = 'rating').fillna(0)
ratings_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,175991,175993,175995,175999,176001,176003,176007,176157,176211,176271
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34,3.0,3.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,2.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# normalize the data by each users mean and convert it from a dataframe to a numpy array
R = ratings_matrix.values
user_ratings_mean = np.mean(R, axis = 1)
Ratings_demeaned = R - user_ratings_mean.reshape(-1, 1)


In [7]:
# Compute the largest k singular values/vectors for a sparse matrix
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(Ratings_demeaned, k = 50)


In [8]:
# convert sigma to a diagonal matrix
sigma = np.diag(sigma)


In [9]:
# calculate the dot product of our vectors
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)


In [10]:
preds = pd.DataFrame(all_user_predicted_ratings, columns = ratings_matrix.columns)
preds.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,175991,175993,175995,175999,176001,176003,176007,176157,176211,176271
0,2.560615,0.031274,-0.651091,0.02062,-0.533642,0.264343,-0.504495,-0.057654,-0.134222,0.201482,...,0.001771,0.001789,0.001807,0.001807,0.001807,0.001824,0.001754,0.001644,0.002967,0.002237
1,0.441066,-0.297052,0.211197,-0.065219,0.153935,2.553422,0.270768,-0.053733,0.102689,0.535472,...,-0.001425,-0.001077,-0.000729,-0.000729,-0.000729,-0.000382,-0.001773,-0.000501,-0.003033,-0.002046
2,-0.352039,-0.094756,-0.192779,0.06451,-0.023931,0.06482,-0.130847,0.012888,-0.102441,-0.70903,...,0.007978,0.007942,0.007906,0.007906,0.007906,0.00787,0.008014,0.009173,0.005091,0.006427
3,1.296987,1.336419,-0.154512,-0.097812,-0.045746,3.126836,-0.138816,-0.105895,0.094546,1.727142,...,-0.00014,-4.2e-05,5.6e-05,5.6e-05,5.6e-05,0.000154,-0.000238,0.002996,0.000401,-0.000531
4,4.743518,0.725185,-0.024024,-0.112855,0.097943,0.340626,0.199099,-0.030649,-0.011594,1.09807,...,0.001233,0.001167,0.001102,0.001102,0.001102,0.001036,0.001298,-3.7e-05,0.001855,0.002679


In [13]:
# convert preds from float64 to float32
preds.astype(np.float32)

# convert all negatives values to zero 
preds[preds < 0] = 0
preds.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,175991,175993,175995,175999,176001,176003,176007,176157,176211,176271
0,2.560615,0.031274,0.0,0.02062,0.0,0.264343,0.0,0.0,0.0,0.201482,...,0.001771,0.001789,0.001807,0.001807,0.001807,0.001824,0.001754,0.001644,0.002967,0.002237
1,0.441066,0.0,0.211197,0.0,0.153935,2.553422,0.270768,0.0,0.102689,0.535472,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.06451,0.0,0.06482,0.0,0.012888,0.0,0.0,...,0.007978,0.007942,0.007906,0.007906,0.007906,0.00787,0.008014,0.009173,0.005091,0.006427
3,1.296987,1.336419,0.0,0.0,0.0,3.126836,0.0,0.0,0.094546,1.727142,...,0.0,0.0,5.6e-05,5.6e-05,5.6e-05,0.000154,0.0,0.002996,0.000401,0.0
4,4.743518,0.725185,0.0,0.0,0.097943,0.340626,0.199099,0.0,0.0,1.09807,...,0.001233,0.001167,0.001102,0.001102,0.001102,0.001036,0.001298,0.0,0.001855,0.002679


In [14]:
# add user id to our 'preds' matrix from ratings_matrix 
id = ratings_matrix.index
preds2 = preds.copy()

In [16]:
id = ratings_matrix.index
preds2['id'] = id
preds2.head()


movieId,1,2,3,4,5,6,7,8,9,10,...,175993,175995,175999,176001,176003,176007,176157,176211,176271,id
0,2.560615,0.031274,0.0,0.02062,0.0,0.264343,0.0,0.0,0.0,0.201482,...,0.001789,0.001807,0.001807,0.001807,0.001824,0.001754,0.001644,0.002967,0.002237,12
1,0.441066,0.0,0.211197,0.0,0.153935,2.553422,0.270768,0.0,0.102689,0.535472,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15
2,0.0,0.0,0.0,0.06451,0.0,0.06482,0.0,0.012888,0.0,0.0,...,0.007942,0.007906,0.007906,0.007906,0.00787,0.008014,0.009173,0.005091,0.006427,16
3,1.296987,1.336419,0.0,0.0,0.0,3.126836,0.0,0.0,0.094546,1.727142,...,0.0,5.6e-05,5.6e-05,5.6e-05,0.000154,0.0,0.002996,0.000401,0.0,34
4,4.743518,0.725185,0.0,0.0,0.097943,0.340626,0.199099,0.0,0.0,1.09807,...,0.001167,0.001102,0.001102,0.001102,0.001036,0.001298,0.0,0.001855,0.002679,37


In [17]:
# export matrix to our Data dir ashdf format 
preds2.to_hdf('Data/predsfin_hdf.h5', key='preds',complib='blosc',complevel=9, mode='w')


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->axis0] [items->None]

  f(store)


In [18]:
def recommend_movies(predictions, userId, movies, original_ratings, num_recommendations):
    
    # Get and sort the user's predictions
    user_row_number = preds2.loc[preds2['id']== userId].index[0]
    
    sorted_user_predictions = preds2.iloc[user_row_number].sort_values(ascending=False) 
    sorted_user_predictions = pd.DataFrame(sorted_user_predictions[1:]).reset_index()
    
    # Get the user's data and merge in the movie information.
    user_data = original_ratings[original_ratings.userId == (userId)]
    user_full = (user_data.merge(movies, how = 'left', left_on = 'movieId', right_on = 'movieId').
                     sort_values(['rating'], ascending=False)
                 )

#     print (f"User {0} has already rated {1} movies. format(userID, user_full.shape[0]")
#     print 'Recommending highest {0} predicted ratings movies not already rated.'.format(num_recommendations)
    
    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    recommendations = (movies_sml[~movies_sml['movieId'].isin(user_full['movieId'])].
         merge(pd.DataFrame(sorted_user_predictions), how = 'left',
               left_on = 'movieId',
               right_on = 'movieId').
         rename(columns = {user_row_number: 'Predictions'}).
         sort_values('Predictions', ascending = False).
                       iloc[:num_recommendations]
                      )
    rec_movies = recommendations['Title']
    already_rated = user_full['Title_x']
    return  already_rated, rec_movies

In [19]:
already_rated, predictions = recommend_movies(preds, 33455, movies, ratings_sml, 10)

NameError: name 'movies' is not defined