In [1]:
import numpy as np
import pandas as pd
import time
import os
from surprise import Reader
from surprise import SVD
from surprise import Dataset
from surprise import evaluate, print_perf
from surprise.model_selection import cross_validate

# Read the file
data = pd.io.parsers.read_csv('ratings-oficial.dat', 
    names=['user_id', 'movie_id', 'rating'],
    engine='python', delimiter='::')

movie_data = pd.io.parsers.read_csv('tv-shows-oficial.dat',
    names=['movie_id', 'title'],
    engine='python', delimiter=',')




In [2]:
# Create the ratings matrix of shape (𝑚×𝑢) with rows as movies and columns as users
ratings_mat = np.ndarray(
    shape=(np.max(data.movie_id.values), np.max(data.user_id.values)),
    dtype=np.uint8)
ratings_mat[data.movie_id.values-1, data.user_id.values-1] = data.rating.values


In [3]:
# movie x users
ratings_mat.shape

(1178, 610)

In [4]:
normalised_mat = ratings_mat - np.asarray([(np.mean(ratings_mat, 1))]).T

print(normalised_mat)

[[ 84.86885246 -10.13114754 -14.13114754 ... -14.13114754  -9.13114754
  -11.13114754]
 [-13.61639344 -13.61639344 -13.61639344 ...  -9.61639344 -13.61639344
   -8.61639344]
 [-15.72786885 -15.72786885 -15.72786885 ... -15.72786885 -15.72786885
  -12.72786885]
 ...
 [ -0.50655738  -0.50655738  -0.50655738 ...   3.49344262  -0.50655738
   -0.50655738]
 [ -0.51147541  -0.51147541  -0.51147541 ...   3.48852459  -0.51147541
   -0.51147541]
 [ -0.51803279  -0.51803279  -0.51803279 ...  -0.51803279  -0.51803279
    3.48196721]]


In [5]:
A = normalised_mat.T / np.sqrt(ratings_mat.shape[0] - 1)
U, S, V = np.linalg.svd(A)


In [6]:
def top_cosine_similarity(data, movie_id):
    index = movie_id - 1 # Movie id starts from 1
    movie_row = data[index, :]
    magnitude = np.sqrt(np.einsum('ij, ij -> i', data, data))
    similarity = np.dot(movie_row, data.T) / (magnitude[index] * magnitude)
    sort_indexes = np.argsort(-similarity)
#     return sort_indexes[:top_n]
    return sort_indexes[1:11]

def get_movie_id(movie_title):
    movie_id = movie_data[movie_data.title == title].movie_id.values[0]
    return movie_id

# Helper function to print top N similar movies
def print_similar_movies(movie_data, movie_id, top_indexes):
    print('Baseado no seu gosto por "{}" você deveria assistir:'.format(
    movie_data[movie_data.movie_id == movie_id].title.values[0]))
    i = 0
    for id in top_indexes + 1:
        i = i + 1
        print('{0}: {1}'.format(i, movie_data[movie_data.movie_id == id].title.values[0]))

In [7]:
title = '90210'

movie_id = get_movie_id(title)
k = 50
sliced = V.T[:, :k] # representative data
indexes = top_cosine_similarity(sliced, movie_id)

start_time = time.time()
print_similar_movies(movie_data, movie_id, indexes)

print ('\nTotal Runtime: {:.2f} seconds'.format(time.time() - start_time))

Baseado no seu gosto por "90210" você deveria assistir:
1: Mad About You
2: Ghost Hunters
3: Man with a Plan
4: Sleepy Hollow
5: Mom
6: Entourage
7: Downton Abbey
8: How It's Made
9: Full House
10: Triage X

Total Runtime: 0.03 seconds
