# References
* https://github.com/nikitaa30/Recommender-Systems/blob/master/matrix_factorisation_svd.py
* https://ohke.hateblo.jp/entry/2017/10/06/230000

In [1]:
#!/usr/bin/env python
# coding: utf-8

import pandas as pd
import numpy as np
import os
# Directory and file names of the input CSVs (paths are relative to the notebook).
data_path = '../data/ml-25m/'
movies_filename = 'movies.csv'
ratings_filename = 'ratings_500000.csv'

# Movie table: only the id and title columns are read, with explicit dtypes.
movies_csv = os.path.join(data_path, movies_filename)
df_movies = pd.read_csv(
    movies_csv,
    usecols=['movieId', 'title'],
    dtype={'movieId': 'int32', 'title': 'str'},
)

# Ratings table: one row per (userId, movieId, rating); 32-bit dtypes keep it compact.
ratings_csv = os.path.join(data_path, ratings_filename)
df_ratings = pd.read_csv(
    ratings_csv,
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'},
)

In [2]:
# Preview the first rows of the movie table to check the columns that were loaded.
df_movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [3]:
# Preview the first rows of the ratings table to check the columns that were loaded.
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5


In [4]:
# Keep at most the first 2,000,000 rating rows (positional slice).
df_ratings = df_ratings.iloc[:2000000]

# Reshape to one row per userId and one column per movieId; user/movie pairs
# without a rating are filled with 0. The calls stay chained so the intermediate
# NaN-filled frame is not kept alive under a separate name.
df_movie_features = df_ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

In [5]:
# Preview the user x movie rating matrix (rows: userId, columns: movieId).
df_movie_features.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,206845,206861,207309,207367,207642,207890,208002,208080,208793,208939
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# `DataFrame.as_matrix()` was deprecated in pandas 0.23 and removed in 1.0;
# `.values` yields the same 2-D ndarray and exists on every pandas version.
R = df_movie_features.values
# Per-user mean taken over *all* movie columns, i.e. including the 0 placeholders
# that stand in for movies the user has not rated.
user_ratings_mean = np.mean(R, axis = 1)
# Centre each user's row on that mean; reshape(-1, 1) turns the means into a
# column vector so they broadcast across the movie axis.
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

In [7]:
from scipy.sparse.linalg import svds

# Number of latent factors to keep. With the default ARPACK solver svds requires
# k < min(R_demeaned.shape), so the requested 50 is capped for small rating
# matrices instead of raising.
n_latent_factors = min(50, min(R_demeaned.shape) - 1)
# Truncated SVD of the de-meaned ratings: R_demeaned ~= U . diag(sigma) . Vt.
# sigma comes back as a 1-D array of singular values.
U, sigma, Vt = svds(R_demeaned, k = n_latent_factors)

In [8]:
# svds returns the singular values as a 1-D vector; expand them into a diagonal
# matrix for the matrix products that follow. The ndim guard keeps this cell
# safe to re-run: np.diag applied to an already-diagonal 2-D matrix would
# extract the diagonal and collapse sigma back to 1-D.
if np.ndim(sigma) == 1:
    sigma = np.diag(sigma)

In [9]:
# Rebuild the dense rating estimates from the truncated factors, then add each
# user's mean back as a column vector so it broadcasts across that user's row.
user_mean_column = user_ratings_mean.reshape(-1, 1)
all_user_predicted_ratings = U.dot(sigma).dot(Vt) + user_mean_column

In [10]:
# Label the prediction columns with the movieIds of the rating matrix. No index
# is passed, so rows get a default 0-based positional index rather than userId labels.
preds_df = pd.DataFrame(
    data=all_user_predicted_ratings,
    columns=df_movie_features.columns,
)
preds_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,206845,206861,207309,207367,207642,207890,208002,208080,208793,208939
0,0.515167,-0.080313,-0.008541,-0.0488,-0.025946,-0.085236,0.127588,-0.051469,-0.014834,0.178013,...,0.019319,-0.002246,0.004274,0.0172,-0.007627,-0.013385,-0.007457,0.012339,-0.008083,-0.006794
1,3.869668,0.82094,0.158686,-0.150456,0.023594,0.281925,0.363198,0.062348,-0.047681,0.748787,...,-0.00512,0.023512,0.008299,0.018104,-0.000656,-0.012287,-0.005381,0.013593,0.028076,-0.028867
2,1.371256,0.704185,-0.909727,-0.045299,-0.419001,0.579195,0.507884,-0.02205,-0.433959,-1.088017,...,0.045973,-0.04155,-0.021525,-0.014149,-0.044618,-0.02763,-0.029925,-0.018819,-0.088327,-0.011344
3,2.763624,-0.135987,-0.378702,-0.049371,-0.114699,0.080881,0.157354,-0.091593,-0.039038,0.504809,...,-0.022066,0.044557,0.056673,0.037261,-0.006871,0.002234,0.01973,0.026221,-0.010265,-0.007886
4,3.988339,0.392482,1.546642,0.10956,1.271868,1.342249,0.964998,0.00171,0.451311,0.823653,...,0.032269,-0.021383,-0.015136,0.006288,-0.013406,-0.003555,-0.00998,0.004572,-0.013988,-0.017997


In [11]:
def recommend_movies(preds_df, userID, movies_df, original_ratings_df, num_recommendations=5):
    """Return a user's rated movies and the best-predicted movies they have not rated.

    Parameters
    ----------
    preds_df : pandas.DataFrame
        Predicted ratings with one column per movieId. Rows are looked up by
        position: row ``userID - 1`` is taken to belong to ``userID``, which
        assumes user ids are contiguous and start at 1.
    userID : int
        1-based id of the user to produce recommendations for.
    movies_df : pandas.DataFrame
        Movie metadata; must contain a ``movieId`` column.
    original_ratings_df : pandas.DataFrame
        Observed ratings with ``userId``, ``movieId`` and ``rating`` columns.
    num_recommendations : int, default 5
        Maximum number of recommended movies to return.

    Returns
    -------
    user_full : pandas.DataFrame
        The user's ratings joined with the movie metadata, highest rating first.
    recommendations : pandas.DataFrame
        Rows of ``movies_df`` for movies the user has not rated, ordered by
        predicted rating (highest first). The prediction column itself is not
        included. Movies without a column in ``preds_df`` sort last.

    Raises
    ------
    IndexError
        If ``userID - 1`` is not a valid row position of ``preds_df``.
    """
    user_row_number = userID - 1  # UserID starts at 1, not 0

    # Reject out-of-range ids explicitly: a non-positive userID would otherwise
    # become a negative position and silently select another user's row.
    if not 0 <= user_row_number < len(preds_df):
        raise IndexError(
            'userID {0} is out of range for predictions covering {1} users'.format(
                userID, len(preds_df)))

    # Two-column frame (movieId, Predictions) for this user. Naming the columns
    # here avoids relying on the row label or the columns' axis name downstream.
    user_predictions = preds_df.iloc[user_row_number]
    predictions = pd.DataFrame({
        'movieId': user_predictions.index,
        'Predictions': user_predictions.values,
    })

    # Movies the user has already rated, with titles attached, best rated first.
    user_data = original_ratings_df[original_ratings_df.userId == userID]
    user_full = (
        user_data
        .merge(movies_df, how='left', on='movieId')
        .sort_values(['rating'], ascending=False)
    )

    # Rank the remaining movies by predicted rating and keep the top ones; the
    # helper Predictions column is dropped by name before returning.
    unrated_movies = movies_df[~movies_df['movieId'].isin(user_full['movieId'])]
    recommendations = (
        unrated_movies
        .merge(predictions, how='left', on='movieId')
        .sort_values('Predictions', ascending=False)
        .iloc[:num_recommendations]
        .drop('Predictions', axis=1)
    )

    return user_full, recommendations

In [12]:
# Ask for 10 recommendations for user 330; the call also returns the movies that user already rated.
already_rated, predictions = recommend_movies(preds_df, 330, df_movies, df_ratings, 10)
# NOTE(review): with default notebook settings only the last expression of a cell
# is rendered, so the value of the next line is discarded. Move it to its own cell
# (or wrap it in display(...)) if the already-rated table should be shown.
already_rated.head(10)
predictions

Unnamed: 0,movieId,title
436,457,"Fugitive, The (1993)"
361,380,True Lies (1994)
105,110,Braveheart (1995)
158,165,Die Hard: With a Vengeance (1995)
147,153,Batman Forever (1995)
561,588,Aladdin (1992)
284,296,Pulp Fiction (1994)
565,595,Beauty and the Beast (1991)
304,318,"Shawshank Redemption, The (1994)"
323,339,While You Were Sleeping (1995)
