In [141]:
import numpy as np
import pandas as pd
import os
from scipy.sparse.linalg import svds
from surprise import Reader, Dataset, SVD, evaluate
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz
from scipy.sparse import csr_matrix

In [142]:
dataset_path = os.path.join(os.environ['PWD'],'movielens_dataset')
movies_filename = 'movies.csv'
ratings_filename = 'ratings.csv'

In [143]:
movies = pd.read_csv(dataset_path+'/'+movies_filename)
ratings = pd.read_csv(dataset_path+'/'+ratings_filename)

In [144]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [145]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [146]:
n_users = ratings.userId.unique().shape[0]
n_movies = ratings.movieId.unique().shape[0]
print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_movies))

Number of users = 610 | Number of movies = 9724


In [147]:
Ratings = ratings.pivot(index = 'userId', columns ='movieId', values = 'rating').fillna(0)
Ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [148]:
#Normalization
R = Ratings.values
user_ratings_mean = np.mean(R, axis = 1)
Ratings_demeaned = R - user_ratings_mean.reshape(-1, 1)
print(Ratings_demeaned)

[[ 3.89582476 -0.10417524  3.89582476 ... -0.10417524 -0.10417524
  -0.10417524]
 [-0.01177499 -0.01177499 -0.01177499 ... -0.01177499 -0.01177499
  -0.01177499]
 [-0.00976964 -0.00976964 -0.00976964 ... -0.00976964 -0.00976964
  -0.00976964]
 ...
 [ 2.23215755  1.73215755  1.73215755 ... -0.26784245 -0.26784245
  -0.26784245]
 [ 2.98755656 -0.01244344 -0.01244344 ... -0.01244344 -0.01244344
  -0.01244344]
 [ 4.50611888 -0.49388112 -0.49388112 ... -0.49388112 -0.49388112
  -0.49388112]]


In [149]:
sparsity = round(1.0 - len(ratings) / float(n_users * n_movies), 3)
print('Sparsity: ' +  str(sparsity * 100) + '%')

Sparsity: 98.3%


In [150]:
U, sigma, Vt = svds(Ratings_demeaned, k = 50)
print(U.shape)
print(sigma.shape)
print(Vt.shape)

(610, 50)
(50,)
(50, 9724)


In [151]:
sigma = np.diag(sigma)
print(sigma.shape)

(50, 50)


In [152]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

In [153]:
preds = pd.DataFrame(all_user_predicted_ratings,columns = Ratings.columns)
test_preds = pd.DataFrame(all_user_predicted_ratings, index = Ratings.index,columns = Ratings.columns)
preds.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
0,2.167328,0.402751,0.840184,-0.076281,-0.551337,2.504091,-0.890114,-0.026443,0.196974,1.593259,...,-0.023453,-0.019967,-0.026939,-0.026939,-0.023453,-0.026939,-0.023453,-0.023453,-0.023453,-0.058732
1,0.211459,0.006658,0.033455,0.017419,0.18343,-0.062473,0.083037,0.024158,0.04933,-0.15253,...,0.019498,0.016777,0.022219,0.022219,0.019498,0.022219,0.019498,0.019498,0.019498,0.032281
2,0.003588,0.030518,0.046393,0.008176,-0.006247,0.107328,-0.012416,0.003779,0.007297,-0.059362,...,0.005909,0.006209,0.00561,0.00561,0.005909,0.00561,0.005909,0.005909,0.005909,0.008004
3,2.051549,-0.387104,-0.252199,0.087562,0.130465,0.27021,0.477835,0.040313,0.025858,-0.017365,...,0.004836,0.004172,0.0055,0.0055,0.004836,0.0055,0.004836,0.004836,0.004836,-0.023311
4,1.344738,0.778511,0.065749,0.111744,0.273144,0.584426,0.25493,0.128788,-0.085541,1.023455,...,-0.008042,-0.007419,-0.008664,-0.008664,-0.008042,-0.008664,-0.008042,-0.008042,-0.008042,-0.010127


In [154]:
def recommend_movies(predictions, userID, movies, original_ratings, num_recommendations):
    
    # Get and sort the user's predictions
    user_row_number = userID - 1 # User ID starts at 1, not 0
    sorted_user_predictions = preds.iloc[user_row_number].sort_values(ascending=False) # User ID starts at 1
    
    # Get the user's data and merge in the movie information.
    user_data = original_ratings[original_ratings.userId == (userID)]
    user_full = (user_data.merge(movies, how = 'left', left_on = 'movieId', right_on = 'movieId').
                     sort_values(['rating'], ascending=False)
                 )

    print('User {0} has already rated {1} movies.'.format(userID, user_full.shape[0]))
    print('Recommending highest {0} predicted ratings movies not already rated.'.format(num_recommendations))
    
    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    recommendations = (movies[~movies['movieId'].isin(user_full['movieId'])].
         merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left',
               left_on = 'movieId',
               right_on = 'movieId').
         rename(columns = {user_row_number: 'Predictions'}).
         sort_values('Predictions', ascending = False).
                       iloc[:num_recommendations, :-1]
                      )

    return user_full, recommendations

In [160]:
already_rated, predictions = recommend_movies(preds, 30, movies, ratings, 15)

User 30 has already rated 34 movies.
Recommending highest 15 predicted ratings movies not already rated.


In [161]:
already_rated.head(15)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,30,110,5.0,1500370456,Braveheart (1995),Action|Drama|War
16,30,58559,5.0,1500370398,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX
31,30,115617,5.0,1500370457,Big Hero 6 (2014),Action|Animation|Comedy
30,30,112852,5.0,1500370426,Guardians of the Galaxy (2014),Action|Adventure|Sci-Fi
29,30,111759,5.0,1500370446,Edge of Tomorrow (2014),Action|Sci-Fi|IMAX
28,30,109487,5.0,1500370443,Interstellar (2014),Sci-Fi|IMAX
25,30,96821,5.0,1500370373,"Perks of Being a Wallflower, The (2012)",Drama|Romance
24,30,95510,5.0,1500370369,"Amazing Spider-Man, The (2012)",Action|Adventure|Sci-Fi|IMAX
23,30,93510,5.0,1500370378,21 Jump Street (2012),Action|Comedy|Crime
22,30,91529,5.0,1500370452,"Dark Knight Rises, The (2012)",Action|Adventure|Crime|IMAX


In [157]:
predictions

Unnamed: 0,movieId,title,genres
2214,2959,Fight Club (1999),Action|Crime|Drama|Thriller
7671,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX
2662,3578,Gladiator (2000),Action|Adventure|Drama
1492,2028,Saving Private Ryan (1998),Action|Drama|War
506,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
311,356,Forrest Gump (1994),Comedy|Drama|Romance|War
959,1270,Back to the Future (1985),Adventure|Comedy|Sci-Fi
893,1197,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance
255,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
7281,76093,How to Train Your Dragon (2010),Adventure|Animation|Children|Fantasy|IMAX


In [158]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return sqrt(mean_squared_error(pred, actual))

In [159]:
print('MF using SVD CF Ratings Predictor RMSE: ' + str(rmse(test_preds.values, Ratings.values)))

MF using SVD CF Ratings Predictor RMSE: 1.996547211642056
