In [1]:
import numpy as np
import scipy as sp
import pandas as pd
from imdb import IMDb
from sklearn.metrics.pairwise import cosine_similarity
# Keep rendered DataFrames compact: show at most 5 rows and 5 columns,
# eliding the middle of larger frames with "...".
pd.set_option('display.max_rows', 5)
pd.set_option('display.max_columns', 5)

Ratings Data File Structure (ratings.csv)
-----------------------------------------

All ratings are contained in the file `ratings.csv`. Each line of this file after the header row represents one rating of one movie by one user, and has the following format:

    userId,movieId,rating,timestamp

The lines within this file are ordered first by userId, then, within user, by movieId.

Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars).

Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.

In [2]:
# Load the ratings (userId, movieId, rating). The timestamp column is dropped
# here, by name: the positional form drop('timestamp', 1) is deprecated and
# fails on pandas >= 2.0.
ratings = pd.read_csv('ml-latest-small/ratings.csv').drop(columns='timestamp')
display(ratings)

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
...,...,...,...
100002,671,6385,2.5
100003,671,6565,3.5


Links Data File Structure (links.csv)
---------------------------------------

Identifiers that can be used to link to other sources of movie data are contained in the file `links.csv`. Each line of this file after the header row represents one movie, and has the following format:

    movieId,imdbId,tmdbId

movieId is an identifier for movies used by <https://movielens.org>. E.g., the movie Toy Story has the link <https://movielens.org/movies/1>.

imdbId is an identifier for movies used by <http://www.imdb.com>. E.g., the movie Toy Story has the link <http://www.imdb.com/title/tt0114709/>.

tmdbId is an identifier for movies used by <https://www.themoviedb.org>. E.g., the movie Toy Story has the link <https://www.themoviedb.org/movie/862>.

Use of the resources listed above is subject to the terms of each provider.

In [3]:
# Load the movieId -> imdbId mapping. The tmdbId column is dropped here, by
# name: the positional form drop('tmdbId', 1) is deprecated and fails on
# pandas >= 2.0.
links = pd.read_csv('ml-latest-small/links.csv').drop(columns='tmdbId')
display(links)

Unnamed: 0,movieId,imdbId
0,1,114709
1,2,113497
...,...,...
9123,164977,27660
9124,164979,3447228


## Criação da tabela esparsa pivô

In [4]:
# Build the movie x user matrix of mean-centred ratings:
#   1. pivot to one row per user and one column per movie (NaN = not rated);
#   2. subtract from each user's row the mean of that user's ratings
#      (missing entries are ignored by the mean);
#   3. replace the remaining NaN (unrated pairs) with 0;
#   4. transpose so that each row is a movie.
pt = (
    ratings
    .pivot_table(values='rating', index='userId', columns='movieId')
    .apply(lambda user_row: user_row - np.mean(user_row), axis=1)
    .fillna(0)
    .T
)
display(pt)

userId,1,2,...,670,671
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.0,0.0,...,0.193548,1.082609
2,0.0,0.0,...,0.000000,0.000000
...,...,...,...,...,...
162672,0.0,0.0,...,0.000000,0.000000
163949,0.0,0.0,...,0.000000,0.000000


In [5]:
# Store the movie x user matrix in Compressed Sparse Row form; the zeros
# filled in for unrated (movie, user) pairs are not stored explicitly.
pt_sparse = sp.sparse.csr_matrix(pt.to_numpy())
display(pt_sparse)

<9066x671 sparse matrix of type '<class 'numpy.float64'>'
	with 99621 stored elements in Compressed Sparse Row format>

Utilizando a similaridade de cosseno na matriz
--
Na matriz gerada, o valor da linha *i* com a coluna *j* mostra a similaridade do filme de id *i* com o filme de id *j*.

Quanto mais próximo de 1, mais similares são os filmes. O valor máximo é 1.

In [6]:
# Movie-to-movie cosine similarity, labelled with movieId on both axes so a
# similarity can be looked up as item_df.loc[movie_a, movie_b].
item_df = pd.DataFrame(
    data=cosine_similarity(pt_sparse),
    index=pt.index,
    columns=pt.index,
)
display(item_df)

movieId,1,2,...,162672,163949
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1.000000,-0.042287,...,0.0,0.009536
2,-0.042287,1.000000,...,0.0,0.000000
...,...,...,...,...,...
162672,0.000000,0.000000,...,1.0,0.000000
163949,0.009536,0.000000,...,0.0,1.000000


## Funções auxiliares

In [7]:
def get_imdb_id(movie_id):
    """Return the IMDb id of a MovieLens movie as a zero-padded string.

    Parameters
    ----------
    movie_id : int
        MovieLens ``movieId`` to look up in the ``links`` table.

    Returns
    -------
    str
        The ``imdbId`` of the first matching row, left-padded with zeros to
        at least seven characters (e.g. ``114709`` -> ``'0114709'``).

    Raises
    ------
    KeyError
        If ``movie_id`` does not appear in ``links['movieId']``.
    """
    matches = links.loc[links['movieId'] == movie_id, 'imdbId']
    if matches.empty:
        # Fail with a message naming the id instead of an opaque
        # "index 0 is out of bounds" error from the empty selection.
        raise KeyError('movieId {!r} not found in links'.format(movie_id))
    return str(matches.iloc[0]).zfill(7)
display(get_imdb_id(1))

'0114709'

In [8]:
def get_imdb_link(movie_id):
    """Return the IMDb title-page URL for a MovieLens movie id.

    Parameters
    ----------
    movie_id : int
        MovieLens ``movieId``; it is converted with ``get_imdb_id``.

    Returns
    -------
    str
        ``'http://www.imdb.com/title/tt'`` followed by the zero-padded
        IMDb id.
    """
    return 'http://www.imdb.com/title/tt' + get_imdb_id(movie_id)
display(get_imdb_link(1))

'http://www.imdb.com/title/tt0114709'

Gerando a recomendação
--
Retorna uma lista com os links do IMDb dos cinco filmes mais similares ao filme passado como parâmetro.

In [9]:
def recommend(movie_id, n=5):
    """Return IMDb links for the movies most similar to ``movie_id``.

    Parameters
    ----------
    movie_id : int
        MovieLens ``movieId`` of the reference movie; it must be a label of
        ``item_df``.
    n : int, optional
        Number of recommendations to return (default 5).

    Returns
    -------
    list of str
        IMDb URLs of the ``n`` movies with the highest cosine similarity to
        ``movie_id``, most similar first. The reference movie itself is
        never included.

    Raises
    ------
    KeyError
        If ``movie_id`` is not a label of ``item_df``.
    """
    # Remove the reference movie by label rather than skipping position 0:
    # another movie tied at similarity 1.0 could otherwise sort ahead of it,
    # leaving the reference movie inside its own recommendations.
    similarities = item_df[movie_id].drop(labels=movie_id)
    # Rank only this one column instead of sorting the whole square matrix.
    most_similar = similarities.nlargest(n).index
    return [get_imdb_link(movie) for movie in most_similar]
display(recommend(1))

['http://www.imdb.com/title/tt0120363',
 'http://www.imdb.com/title/tt0435761',
 'http://www.imdb.com/title/tt0120623',
 'http://www.imdb.com/title/tt0110074',
 'http://www.imdb.com/title/tt0317705']