In [36]:
import pandas as pd
import numpy as np
from scipy.sparse.linalg import svds

In [37]:
data_dir = 'ml-25m/'
movies_df = pd.read_csv(data_dir + 'movies.csv')
# Using the first million records due to runtime limitations.
# nrows stops parsing after that many data rows, so the rest of the ratings
# file is never loaded just to be thrown away by a later .head().
ratings_df = pd.read_csv(data_dir + 'ratings.csv', nrows=1000000)

In [38]:
# Preview the first five rows of the movie catalogue.
movies_df.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [39]:
# Preview the first five rows of the ratings table.
ratings_df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [40]:
# Reshape the long ratings table into a dense matrix with one row per userId
# and one column per movieId; (user, movie) pairs without a rating become 0.
ratings_pivot_df = (
    ratings_df
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(0)
)
ratings_pivot_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,207367,207405,207642,207830,207890,208002,208080,208737,208793,208939
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
# Take an explicit copy of the pivot data before centering. `.values` may hand
# back a view of the frame's own buffer, in which case the in-place subtraction
# below would silently rewrite ratings_pivot_df (or fail on a read-only array
# when pandas copy-on-write is active), and re-running this cell would compute
# the mean of already-centered data.
ratings = ratings_pivot_df.to_numpy(copy=True)

# Per-user mean, kept as a column vector so it broadcasts across movies.
# NOTE: the mean is taken over every column of the pivot, including the cells
# that were filled with 0 because the user never rated that movie.
ratings_mean = ratings.mean(axis=1, keepdims=True)
ratings -= ratings_mean

In [42]:
# Truncated SVD of the centered rating matrix, keeping 50 singular triplets.
U, sigma, V_t = svds(ratings, k=50)

# svds returns the singular values as a 1-D array; expand them into a diagonal
# matrix so the factors can be multiplied back together.
sigma = np.diag(sigma)

# Low-rank reconstruction, with the per-user mean added back.
preds = (U @ sigma) @ V_t + ratings_mean

# Label the columns with the movie ids of the pivot table; rows keep the
# default positional index.
preds_df = pd.DataFrame(preds, columns=ratings_pivot_df.columns)

In [43]:
def get_movie_recs(preds_df, movies_df, ratings_df, user_id):
    """Return a user's rating history and the unrated movies ranked by prediction.

    Parameters
    ----------
    preds_df : DataFrame
        Predicted ratings, one row per user and one column per movie id.
        The row at position ``user_id - 1`` is used for ``user_id``.
        NOTE(review): this assumes user ids are consecutive integers starting
        at 1 and that rows are stored in that order - confirm against how
        ``preds_df`` was built.
    movies_df : DataFrame
        Movie metadata; must contain a ``movieId`` column.
    ratings_df : DataFrame
        Known ratings; must contain ``userId``, ``movieId`` and ``rating``.
    user_id : int
        Id of the user to build recommendations for.

    Returns
    -------
    (user_data, recs) : tuple of DataFrame
        ``user_data`` holds the user's existing ratings joined with the movie
        metadata, highest rating first. ``recs`` holds the rows of
        ``movies_df`` the user has not rated, ordered by predicted rating
        (highest first; movies without a prediction come last). The
        prediction column itself is not included in ``recs``.

    Raises
    ------
    IndexError
        If ``user_id`` does not map to a row of ``preds_df``.
    """
    user_idx = user_id - 1
    # Without this check a non-positive user_id would wrap around through
    # negative positional indexing and pick up some other user's row.
    if not 0 <= user_idx < len(preds_df):
        raise IndexError(
            f'user_id {user_id} is out of range for predictions with {len(preds_df)} rows'
        )

    # Name the prediction column and the movie-id axis explicitly so the merge
    # and the sort below do not depend on how preds_df happens to be labelled.
    user_preds = (
        preds_df.iloc[user_idx]
        .rename('predicted_rating')
        .rename_axis('movieId')
        .reset_index()
    )

    user_data = ratings_df[ratings_df.userId == user_id]
    user_data = user_data.merge(movies_df, how='left', on='movieId')
    user_data = user_data.sort_values(['rating'], ascending=False)
    print(f'User {user_id} has previously rated {len(user_data)} movies.')

    # Keep only movies the user has not rated, attach their predictions, rank
    # them, then drop the helper column so the result has movies_df's columns.
    recs = movies_df[~movies_df['movieId'].isin(user_data['movieId'])]
    recs = recs.merge(user_preds, how='left', on='movieId')
    recs = recs.sort_values('predicted_rating', ascending=False)
    recs = recs.drop(columns='predicted_rating')

    return user_data, recs

In [44]:
# Build recommendations for one user and show the top ten rows of their rating
# history followed by the top ten recommended movies.
# NOTE: `preds` is rebound here from the prediction matrix to the
# recommendations frame returned by get_movie_recs.
user_id = 1
prev_ratings, preds = get_movie_recs(preds_df, movies_df, ratings_df, user_id)
display(prev_ratings.head(10), preds.head(10))

User 1 has previously rated 70 movies.


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
18,1,2632,5.0,1147878248,"Saragossa Manuscript, The (Rekopis znaleziony ...",Adventure|Drama|Mystery
57,1,8327,5.0,1147879375,Dolls (2002),Drama|Romance
56,1,8154,5.0,1147868865,"Dolce Vita, La (1960)",Drama
48,1,7361,5.0,1147880055,Eternal Sunshine of the Spotless Mind (2004),Drama|Romance|Sci-Fi
41,1,6711,5.0,1147868622,Lost in Translation (2003),Comedy|Drama|Romance
37,1,6016,5.0,1147869090,City of God (Cidade de Deus) (2002),Action|Adventure|Crime|Drama|Thriller
33,1,5767,5.0,1147878729,Teddy Bear (Mis) (1981),Comedy|Crime
26,1,4325,5.0,1147878122,"Night, The (Notte, La) (1960)",Drama
24,1,4144,5.0,1147868898,In the Mood For Love (Fa yeung nin wa) (2000),Drama|Romance


Unnamed: 0,movieId,title,genres
758,778,Trainspotting (1996),Comedy|Crime|Drama
6709,6874,Kill Bill: Vol. 1 (2003),Action|Crime|Thriller
6985,7153,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy
4097,4226,Memento (2000),Mystery|Thriller
4857,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
2846,2959,Fight Club (1999),Action|Crime|Drama|Thriller
7249,7438,Kill Bill: Vol. 2 (2004),Action|Drama|Thriller
4176,4306,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...
4713,4848,Mulholland Drive (2001),Crime|Drama|Film-Noir|Mystery|Thriller
4751,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy
