<a href="https://colab.research.google.com/github/vimigueloli/Sistemas-de-recomendacao/blob/main/collaborative_filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Collaborative filtering é a técnica de filtrar conteúdo com base em informações fornecidas pelo usuário.

In [91]:
# Load the data from the CSV files available in the Colab session
import pandas as pd

## Movie catalogue, with its columns renamed to the names used in the rest of the notebook
filmes = pd.read_csv("movies.csv").set_axis(["filmeId", "filme", "genero"], axis=1)

## Ratings given by the users to the movies, renamed the same way
notas = pd.read_csv("ratings.csv").set_axis(
    ["usuario", "filmeId", "nota", "momento"], axis=1
)
notas.head()

Unnamed: 0,usuario,filmeId,nota,momento
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [92]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column of the ratings table
notas.describe()

Unnamed: 0,usuario,filmeId,nota,momento
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [93]:
# Count how many ratings each movie received; value_counts() returns a Series
# indexed by filmeId, ordered from the most-rated movie to the least-rated
votos = notas["filmeId"].value_counts()
votos.head()

356     329
318     317
296     307
593     279
2571    278
Name: filmeId, dtype: int64

In [94]:
# Use the movie id as the index of `filmes` so a movie can be looked up by id with .loc.
# The guard keeps the cell re-runnable: after the first run "filmeId" is no longer a
# column, and calling set_index("filmeId") again would raise KeyError.
if "filmeId" in filmes.columns:
    filmes = filmes.set_index("filmeId")
filmes.loc[356]

filme          Forrest Gump (1994)
genero    Comedy|Drama|Romance|War
Name: 356, dtype: object

In [95]:
# Add the vote count as a column of the movie table. The Series is matched on the
# filmeId index, so movies that received no rating end up with NaN in this column
filmes["quantia_de_votos"] = votos
filmes.head()

Unnamed: 0_level_0,filme,genero,quantia_de_votos
filmeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215.0
2,Jumanji (1995),Adventure|Children|Fantasy,110.0
3,Grumpier Old Men (1995),Comedy|Romance,52.0
4,Waiting to Exhale (1995),Comedy|Drama|Romance,7.0
5,Father of the Bride Part II (1995),Comedy,49.0


In [96]:
# Show the movies ordered by vote count, highest first. The sorted frame is only
# displayed, not stored; movies without any vote (NaN) appear at the bottom
filmes.sort_values("quantia_de_votos", ascending= False)

Unnamed: 0_level_0,filme,genero,quantia_de_votos
filmeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
356,Forrest Gump (1994),Comedy|Drama|Romance|War,329.0
318,"Shawshank Redemption, The (1994)",Crime|Drama,317.0
296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,307.0
593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,279.0
2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,278.0
...,...,...,...
30892,In the Realms of the Unreal (2004),Animation|Documentary,
32160,Twentieth Century (1934),Comedy,
32371,Call Northside 777 (1948),Crime|Drama|Film-Noir,
34482,"Browning Version, The (1951)",Drama,


In [97]:
# Average rating of each movie. Selecting "nota" before aggregating avoids
# computing (and then discarding) the mean of every other column.
nota_media = notas.groupby("filmeId")["nota"].mean()
nota_media.head()

filmeId
1    3.920930
2    3.431818
3    3.259615
4    2.357143
5    3.071429
Name: nota, dtype: float64

In [98]:
# Add the average rating of each movie as the "nota" column (matched on the filmeId index)
filmes["nota"] = nota_media
filmes.head()

Unnamed: 0_level_0,filme,genero,quantia_de_votos,nota
filmeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215.0,3.92093
2,Jumanji (1995),Adventure|Children|Fantasy,110.0,3.431818
3,Grumpier Old Men (1995),Comedy|Romance,52.0,3.259615
4,Waiting to Exhale (1995),Comedy|Drama|Romance,7.0,2.357143
5,Father of the Bride Part II (1995),Comedy,49.0,3.071429


In [99]:
# Recommend the best-rated movies, keeping only titles with at least
# MINIMO_DE_VOTOS ratings (the bound is inclusive) so that an average built
# from a handful of votes cannot reach the top of the ranking
MINIMO_DE_VOTOS = 50
filmes_recomendados = (
    filmes
    .query("quantia_de_votos >= @MINIMO_DE_VOTOS")
    .sort_values("nota", ascending=False)
)
filmes_recomendados.head(10)

Unnamed: 0_level_0,filme,genero,quantia_de_votos,nota
filmeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
318,"Shawshank Redemption, The (1994)",Crime|Drama,317.0,4.429022
858,"Godfather, The (1972)",Crime|Drama,192.0,4.289062
2959,Fight Club (1999),Action|Crime|Drama|Thriller,218.0,4.272936
1276,Cool Hand Luke (1967),Drama,57.0,4.27193
750,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War,97.0,4.268041
904,Rear Window (1954),Mystery|Thriller,84.0,4.261905
1221,"Godfather: Part II, The (1974)",Crime|Drama,129.0,4.25969
48516,"Departed, The (2006)",Crime|Drama|Thriller,107.0,4.252336
1213,Goodfellas (1990),Crime|Drama,126.0,4.25
912,Casablanca (1942),Drama|Romance,100.0,4.24


In [100]:
# Hand-picked ids of movies the user has already watched, and their rows in the movie table
assistidos = [1,15,318,500,2000]
filmes_assistidos = filmes.loc[assistidos]
filmes_assistidos.head()

Unnamed: 0_level_0,filme,genero,quantia_de_votos,nota
filmeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215.0,3.92093
15,Cutthroat Island (1995),Action|Adventure|Romance,13.0,3.0
318,"Shawshank Redemption, The (1994)",Crime|Drama,317.0,4.429022
500,Mrs. Doubtfire (1993),Comedy|Drama,144.0,3.388889
2000,Lethal Weapon (1987),Action|Comedy|Crime|Drama,75.0,3.673333


In [101]:
# From the recommended movies, keep those whose genre string is exactly 'Crime|Drama'
# (the genre of watched movie 318); movies that merely share one of those genres are not matched
## The result is already ordered by rating because filmes_recomendados was sorted by "nota"
filmes_indicados = filmes_recomendados.query("genero == 'Crime|Drama'")
filmes_indicados.head(10)

Unnamed: 0_level_0,filme,genero,quantia_de_votos,nota
filmeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
318,"Shawshank Redemption, The (1994)",Crime|Drama,317.0,4.429022
858,"Godfather, The (1972)",Crime|Drama,192.0,4.289062
1221,"Godfather: Part II, The (1974)",Crime|Drama,129.0,4.25969
1213,Goodfellas (1990),Crime|Drama,126.0,4.25
2329,American History X (1998),Crime|Drama,129.0,4.217054
3147,"Green Mile, The (1999)",Crime|Drama,111.0,4.148649
16,Casino (1995),Crime|Drama,82.0,3.926829
5989,Catch Me If You Can (2002),Crime|Drama,115.0,3.921739
55820,No Country for Old Men (2007),Crime|Drama,64.0,3.898438
33166,Crash (2004),Crime|Drama,50.0,3.89


In [102]:
# Remove the movies the user has already watched from the genre-based suggestions;
# errors='ignore' skips watched ids that are not present in the table
filmes_indicados = filmes_indicados.drop(assistidos, errors='ignore')
filmes_indicados.head(10)

Unnamed: 0_level_0,filme,genero,quantia_de_votos,nota
filmeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
858,"Godfather, The (1972)",Crime|Drama,192.0,4.289062
1221,"Godfather: Part II, The (1974)",Crime|Drama,129.0,4.25969
1213,Goodfellas (1990),Crime|Drama,126.0,4.25
2329,American History X (1998),Crime|Drama,129.0,4.217054
3147,"Green Mile, The (1999)",Crime|Drama,111.0,4.148649
16,Casino (1995),Crime|Drama,82.0,3.926829
5989,Catch Me If You Can (2002),Crime|Drama,115.0,3.921739
55820,No Country for Old Men (2007),Crime|Drama,64.0,3.898438
33166,Crash (2004),Crime|Drama,50.0,3.89
36,Dead Man Walking (1995),Crime|Drama,67.0,3.835821


Procurando usuários similares

In [103]:
# First rows of the ratings table (one row per rating a user gave to a movie)
notas.head()

Unnamed: 0,usuario,filmeId,nota,momento
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [104]:
def notas_by_user(user):
  """Return the ratings given by one user, indexed by movie id.

  Reads the notebook-level `notas` DataFrame.

  Args:
    user: id of the user, compared against the "usuario" column.

  Returns:
    DataFrame indexed by "filmeId" with a single "nota" column; empty when
    the user has no ratings.
  """
  # A boolean mask compares the id as a value instead of formatting it into a
  # query string with "%d", which silently truncated non-integer ids
  # (1.9 was looked up as user 1).
  avaliacoes_do_usuario = notas.loc[notas["usuario"] == user, ["filmeId", "nota"]]
  return avaliacoes_do_usuario.set_index("filmeId")

In [110]:
# Ratings of two sample users, each indexed by movie id
user1 = notas_by_user(10)
user2 = notas_by_user(5)
# Side-by-side ratings (nota1 = user 10, nota2 = user 5); dropna() keeps only
# the movies that both users rated
compare = user1.join(user2, lsuffix="1", rsuffix="2").dropna()
# Display the comparison table (a cell ending in an assignment shows nothing)
compare

Unnamed: 0_level_0,nota1,nota2
filmeId,Unnamed: 1_level_1,Unnamed: 2_level_1
296,1.0,5.0
588,4.0,4.0
597,3.5,3.0


In [116]:
import numpy as np
# Euclidean distance between the two users' ratings over the movies listed in `compare`
distancia = np.linalg.norm(compare["nota1"] - compare["nota2"])
distancia

4.031128874149275

In [117]:
def distancia_usuarios(a, b):
  """Euclidean distance between the ratings of users `a` and `b`.

  Only movies rated by both users are compared.

  Args:
    a: id of the first user.
    b: id of the second user.

  Returns:
    The Euclidean norm of the rating differences over the shared movies, or
    `np.inf` when the two users have no movie in common.
  """
  user1 = notas_by_user(a)
  user2 = notas_by_user(b)
  # Left join + dropna keeps only the movies that both users rated
  compare = user1.join(user2, lsuffix="1", rsuffix="2").dropna()
  if compare.empty:
    # np.linalg.norm of an empty vector is 0.0, which would make users with
    # nothing in common look identical; report them as infinitely far apart.
    return np.inf
  return np.linalg.norm(compare["nota1"] - compare["nota2"])

In [118]:
# Example: distance between users 1 and 2
distancia_usuarios(1,2)

1.4142135623730951