# Simple Movie Recommender Using SVD



In [1]:
# https://alyssaq.github.io/2015/20150426-simple-movie-recommender-using-svd/
import numpy as np
import pandas as pd

In [2]:
# Load the ratings and the movie catalogue from their '::'-delimited files.
# Neither file has a header row, so column names are supplied explicitly.
data = pd.read_csv(
    'data/ratings.dat',
    sep='::',
    engine='python',
    names=['user_id', 'movie_id', 'rating', 'time'],
)
print(data.head(5))

movie_data = pd.read_csv(
    'data/movies.dat',
    sep='::',
    engine='python',
    names=['movie_id', 'title', 'genre'],
)
print(movie_data.head(5))

   user_id  movie_id  rating       time
0        1      1193       5  978300760
1        1       661       3  978302109
2        1       914       3  978301968
3        1      3408       4  978300275
4        1      2355       5  978824291
   movie_id                               title                         genre
0         1                    Toy Story (1995)   Animation|Children's|Comedy
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
2         3             Grumpier Old Men (1995)                Comedy|Romance
3         4            Waiting to Exhale (1995)                  Comedy|Drama
4         5  Father of the Bride Part II (1995)                        Comedy


In [3]:
# Create the ratings matrix of shape (m×n) with rows as movies and columns as users.
# Ids are 1-based, so the largest id along each axis gives that axis' size.
# np.zeros is used (rather than the low-level np.ndarray constructor, which
# leaves its buffer uninitialized) so that every (movie, user) pair without a
# rating is guaranteed to be 0.
ratings_mat = np.zeros(
    shape=(np.max(data.movie_id.values), np.max(data.user_id.values)),
    dtype=np.uint8)

# Scatter each rating into its (movie_id - 1, user_id - 1) cell.
ratings_mat[data.movie_id.values-1, data.user_id.values-1] = data.rating.values

print(ratings_mat)

[[5 0 0 ... 0 0 3]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [8]:
# Compute SVD
# A is the transposed ratings matrix (rows = users, columns = movies), scaled
# by sqrt(number of movies - 1).
A = ratings_mat.T / np.sqrt(ratings_mat.shape[0] - 1)

# full_matrices=False requests the reduced decomposition: U and V are limited
# to min(A.shape) singular vectors instead of being padded out to full square
# matrices, which is considerably cheaper. The leading singular vectors (the
# only ones used for the top-k slice later on) are the same either way.
# Note: np.linalg.svd returns the right singular vectors as the ROWS of its
# third result, so V[i] is the i-th right singular vector.
U, S, V = np.linalg.svd(A, full_matrices=False)

print(V)

[[-7.01371394e-02 -2.35438150e-02 -1.37658393e-02 ... -2.61526344e-03
  -1.16635687e-03 -1.32565863e-02]
 [-2.09401541e-02 -2.97924550e-02 -1.67038987e-02 ...  1.87440015e-03
   2.26511244e-03  5.02213333e-03]
 [ 3.01647236e-02 -1.01890706e-02  1.25724193e-02 ...  1.78319162e-03
   3.52091700e-03  2.23576838e-02]
 ...
 [ 0.00000000e+00 -3.08148791e-32 -1.17528849e-31 ... -1.35465097e-31
   9.24446373e-33  2.31111593e-33]
 [ 0.00000000e+00 -2.81185772e-32 -1.36833979e-32 ... -2.10263402e-31
   5.66704886e-32  1.30963236e-32]
 [ 0.00000000e+00  1.73472348e-17 -7.08802946e-17 ... -1.79977561e-17
   2.38524478e-17 -6.93889390e-17]]


In [5]:
# Calculate cosine similarity, sort by most similar and return the top N.
def top_cosine_similarity(data, movie_id, top_n=10):
    """Return the row indexes of the ``top_n`` rows most similar to a movie.

    Args:
        data: 2-D array with one row per movie; row ``i`` describes the movie
            whose id is ``i + 1``.
        movie_id: 1-based id of the query movie.
        top_n: number of row indexes to return.

    Returns:
        1-D integer array of at most ``top_n`` 0-based row indexes, ordered
        from most to least cosine-similar to the query row. Rows whose
        similarity is undefined (a zero-length vector on either side) are
        ranked last instead of producing NaN and a divide warning.
    """
    index = movie_id - 1  # Movie id starts from 1
    movie_row = data[index, :]

    # Euclidean length of every row: sqrt of the row-wise sum of squares.
    magnitude = np.sqrt(np.einsum('ij, ij -> i', data, data))
    dots = np.dot(movie_row, data.T)
    denom = magnitude[index] * magnitude

    # Only divide where the denominator is non-zero; everything else keeps
    # -inf so that it sorts after every well-defined similarity.
    defined = denom > 0
    similarity = np.full(denom.shape, -np.inf)
    similarity[defined] = dots[defined] / denom[defined]

    sort_indexes = np.argsort(-similarity)
    return sort_indexes[:top_n]

# Helper function to print top N similar movies
def print_similar_movies(movie_data, movie_id, top_indexes):
    """Print the query movie's title followed by each recommended title.

    Args:
        movie_data: DataFrame with at least ``movie_id`` and ``title`` columns.
        movie_id: 1-based id of the movie the recommendations are for.
        top_indexes: 0-based row indexes of the recommended movies (array or
            any sequence of ints); each is turned into a movie id by adding 1.

    Raises:
        IndexError: if ``movie_id`` itself is not present in ``movie_data``.
    """
    def titles_for(wanted_id):
        # All titles registered under this id (empty when the id is absent).
        return movie_data.loc[movie_data.movie_id == wanted_id, 'title'].values

    query_titles = titles_for(movie_id)
    if len(query_titles) == 0:
        raise IndexError(
            'movie_id {0} not found in movie_data'.format(movie_id))
    print('Recommendations for {0}: \n'.format(query_titles[0]))

    # np.asarray lets plain lists be passed as well as arrays.
    for similar_id in np.asarray(top_indexes) + 1:
        matches = titles_for(similar_id)
        if len(matches) == 0:
            # Ids are not guaranteed to be contiguous in movie_data; report
            # the gap instead of aborting the whole listing.
            print('(no title found for movie id {0})'.format(similar_id))
        else:
            print(matches[0])

In [9]:
# Pick the number of principal components (k) used to describe each movie,
# the movie_id to recommend for, and how many results (top_n) to print.
k = 50
movie_id = 1  # Grab an id from movies.dat
top_n = 10

# Keep the first k components and transpose so that each row is one movie.
sliced = V[:k].T
indexes = top_cosine_similarity(sliced, movie_id, top_n=top_n)
print_similar_movies(movie_data, movie_id, top_indexes=indexes)

  


KeyboardInterrupt: 