# Trabalho de Filtragem Colaborativa

Modelo de filtragem colaborativa usa as informações de ratings dos usuários para prover recomendações. <br>
**Ideia Principal:** A semelhança entre os usuários a partir do que se observa da interseção dos seus ratings permite inferir que os dados não inputados por apenas um destes poderia ter a mesma semelhança com relação ao outro. <br>
**Problema Principal:** A esparsidade dos dados, afinal a informação que um usuário provê é normalmente a um subconjunto muito pequeno dos itens. Logo a maioria da base de dados é de dados *faltantes* ou *não observados*. <br>
<br>
Há 2 métodos de filtragem colaborativa: <br>

**Memory-Based:** Também chamado de *neighborhood-based collaborative filtering algorithms*. Que se dividem basicamente em *user-based collaborative filtering* e *item-based collaborative filtering*. <br>
**Model-Based:** Modelos baseados em **machine learning** e **data mining** há um processo de aprendizado prévio para parametrizição. Alguns métodos são Decisions Trees, métodos bayesianos, modelos baseados em regras e latent factor method. <br>

## Implementação usando MovieLens
Carregar a base de 25M ou 100K


In [235]:
import pandas as pd         #DataFrames e operações associadas
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity          #Similaridade
import math  
import sklearn.metrics  

small = False  #usar a base de ratings reduzida ou não
ratings = pd.DataFrame()
movies = pd.DataFrame()

if (small):
    ratings = pd.read_csv('ratings_small.csv')
    movies = pd.read_csv('movies_small.csv').set_index("movieId")
else:
    ratings = pd.read_csv('ratings.csv')[:100000] #ler as primeiras 2.000.000 linhas da base completa
    movies = pd.read_csv('movies.csv')
    movies = movies.set_index('movieId')

#### Funções para auxiliar
def listar_filmes_ja_vistos(usuario, matriz_filmes_X_usuarios):
    #filmes_ja_vistos_bin = matriz_filmes_X_usuarios.loc[usuario].gt(0)   #gerar array com o que usuário já deu rating: True ou False
    #return filmes_ja_vistos_bin.index[filmes_ja_vistos_bin].to_list() #com base no anterior, listar filmes que já viu        
    if(type(usuario)==list):
        filmes_usuario = matriz_filmes_X_usuarios.loc[usuario].sum(axis = 0)
    else: #if(type(usuario)==int):
        filmes_usuario = matriz_filmes_X_usuarios.loc[usuario]
    return filmes_usuario[filmes_usuario!=0].index.to_list()

def listar_filmes_avaliados(usuario_):
    if(type(usuario_)==list):
        return sorted(ratings[ratings['userId'].isin(usuario_)]['movieId'].unique().tolist())
    else:
        return ratings[ratings['userId']==usuario_]['movieId'].values.tolist()

def listar_filmes_nao_vistos(usuario, matriz_filmes_X_usuarios):
    if(type(usuario)==list):
        filmes_usuario = matriz_filmes_X_usuarios.loc[usuario].sum(axis = 0)
    else: 
        filmes_usuario = matriz_filmes_X_usuarios.loc[usuario]
    return filmes_usuario[filmes_usuario==0].index.to_list()

def eliminar_colunas_zeradas(matriz):
    return matriz.loc[:, (matriz != 0).any(axis=0)] #elimina todas as colunas cujos todos os valores são 0

def listar_nomes_filmes(indices):
    return movies.loc[indices]['title'].values.tolist()


In [236]:
n_ratings = len(ratings)
n_users = len(ratings['userId'].unique())
n_movies_avaliados = len(ratings['movieId'].unique())
print(f"Total de ratings: {n_ratings}")
print(f"Total de filmes: {len(movies)}")
print(f"Filmes avaliados: {n_movies_avaliados}")
print(f"Total de usuários: {n_users}")
print(f"Média de ratings/user: {round(n_ratings/n_users, 2)}")
print(f"Shape de Ratings: {ratings.shape}")
esparsidade = round(1.0 -n_ratings/float(n_users * n_movies_avaliados),3)
print(f"O nível de esparsidade do dataset é {esparsidade * 100}%")
ratings.sample(6).sort_index()


Total de ratings: 100000
Total de filmes: 62423
Filmes avaliados: 9786
Total de usuários: 757
Média de ratings/user: 132.1
Shape de Ratings: (100000, 4)
O nível de esparsidade do dataset é 98.7%


Unnamed: 0,userId,movieId,rating,timestamp
34325,261,4361,4.0,999649774
35240,277,2178,5.0,997050642
45818,368,6373,2.5,1407093443
60373,477,4886,4.0,1086625318
76509,590,2023,1.0,1238964915
79804,608,109633,3.5,1439867354


In [237]:
print("Relatório dos ratings dados pelos usuários:")
ratings.groupby('userId')['rating'].count().describe()

Relatório dos ratings dados pelos usuários:


count     757.000000
mean      132.100396
std       205.175858
min        20.000000
25%        35.000000
50%        64.000000
75%       148.000000
max      3212.000000
Name: rating, dtype: float64

## Gerar matriz Movies X User

In [238]:
#movies_X_users = ratings.pivot_table(index="userId", columns="movieId", values="rating", fill_value=0)
# A LINHA DE CIMA FAZ O MESMO QUE A DE BAIXO GASTANDO 15X MAIS TEMPO
moviesX_usersY = ratings.groupby(['userId', 'movieId'])['rating'].first().unstack(fill_value=0.0)
moviesX_usersY

movieId,1,2,3,4,5,6,7,8,9,10,...,204692,204698,204704,205054,205072,205106,205413,205499,205557,206272
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
753,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
754,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
755,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
756,3.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [239]:
titulos_users = moviesX_usersY.copy() #se não mandar uma cópia, titulo users passa ser outro nome da mesma matriz e altera ela diretamente
titulos_users.columns = movies.loc[moviesX_usersY.columns.values.tolist()].title.values.tolist()
titulos_users.head(3)


Unnamed: 0_level_0,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,In the Tall Grass,Joker (2019),Ready or Not (2019),Hustlers (2019),Zombieland: Double Tap (2019),Can You Keep a Secret? (2019),The Laurel-Hardy Murder Case (1930),Getting an Eyeful (1938),Tit for Tat (1935),Haunt (2019)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## User-Based Collaborative Filtering

### Matriz de Similaridade por Usuário
A medida comumente usada é a similaridade do cosseno.
Essa medida de similaridade deve seu nome ao fato de ser igual ao cosseno do ângulo entre os dois vetores que estão sendo comparados:  vetores de similaridade de usuário (ou item) de ratings. Quanto menor o ângulo entre dois vetores, maior será o cosseno, resultando em um fator de similaridade mais alto. 

Dado 2 vetores, A e B, a similiridade por cosseno, cos($\theta$), é representada pelo produto escalar
$$\text{cosine similarity} =S_C (x,y):= \cos(\theta) = {\mathbf{x} \cdot \mathbf{y} \over \|\mathbf{x}\| \|\mathbf{y}\|} = \frac{ \sum\limits_{i=1}^{n}{x_i  y_i} }{ \sqrt{\sum\limits_{i=1}^{n}{x_i^2}}  \sqrt{\sum\limits_{i=1}^{n}{y_i^2}} }$$

In [240]:
users_cosine_array = cosine_similarity(moviesX_usersY)
users_cosine = pd.DataFrame(data=users_cosine_array, index=moviesX_usersY.index, columns=moviesX_usersY.index)
users_cosine.round(3).head()

userId,1,2,3,4,5,6,7,8,9,10,...,748,749,750,751,752,753,754,755,756,757
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.041,0.061,0.041,0.016,0.0,0.094,0.021,0.023,0.026,...,0.024,0.061,0.032,0.0,0.012,0.037,0.032,0.046,0.123,0.037
2,0.041,1.0,0.179,0.197,0.158,0.13,0.065,0.177,0.129,0.157,...,0.084,0.215,0.095,0.15,0.189,0.096,0.084,0.101,0.295,0.133
3,0.061,0.179,1.0,0.358,0.061,0.115,0.031,0.081,0.062,0.132,...,0.043,0.265,0.163,0.11,0.131,0.047,0.039,0.296,0.361,0.111
4,0.041,0.197,0.358,1.0,0.066,0.072,0.016,0.089,0.066,0.084,...,0.021,0.269,0.117,0.079,0.167,0.031,0.03,0.288,0.208,0.091
5,0.016,0.158,0.061,0.066,1.0,0.115,0.202,0.308,0.216,0.27,...,0.204,0.241,0.018,0.087,0.247,0.187,0.294,0.046,0.175,0.219


### Selecionar a similaridade desejada

Nota: se fazer média das notas primeiro e usar cosseno, obtem-se mesmos valores que dá fazendo pearson diretamente. 
```python
    movies_X_users = movies_X_users - np.asarray([(np.mean(movies_X_users, 1))]).T
```
Implementação a seguir de **Pearson** não é recomendada pois gasta esta biblioteca é menos otimizada gastando cerca de 5x da implementação acima de cosseno 

$$r_{xy} =\frac{\sum ^n _{i=1}(x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum ^n _{i=1}(x_i - \bar{x})^2} \sqrt{\sum ^n _{i=1}(y_i - \bar{y})^2}}$$

```python
    users_pearson = movies_X_users.T.corr(method='pearson') #pode usar tb kendall e spearman
```

### Pegar os k usuários mais similares ao Target selecionado

In [241]:
# DEPRECATED: APRESENTAVA ALGUNS ERRROS E VOLTAVA UMA LISTA
#def obter_mais_similares(target, matriz_similaridade, matriz_dados, k = 25):
#    todas_similaridades = matriz_similaridade.loc[target].to_numpy()     #criar um array com a linha do target na matrix de similaridades
#    k_mais_similares = matriz_dados.index[todas_similaridades.argpartition(-k)[-k-1:-1]] #seleciona os k com similaridade mais alta no vetor excluindo ele mesmo
#    return k_mais_similares 
def obter_mais_similares(target, matriz_similaridade, matriz_dados, k = 25):
    similares = matriz_similaridade.loc[target].sort_values(ascending=False).drop(target)
    similares = similares[similares!=0]
    return similares.iloc[:k]

### Matriz de filmes não vistos pelo usuário target e que receberam notas dos usuários mais similares

In [242]:
target = 1
usuarios_mais_similares = obter_mais_similares(1, users_cosine, moviesX_usersY, 25).index
#gerar matriz dos usuarios mais similares x filmes não assistidos ainda pelo usuário
usuarios_similares_X_filmes_nao_vistos = moviesX_usersY.loc[usuarios_mais_similares].drop(columns=listar_filmes_ja_vistos(1,moviesX_usersY)) 
usuarios_similares_X_filmes_nao_vistos = eliminar_colunas_zeradas(usuarios_similares_X_filmes_nao_vistos)
print('Matriz de filmes não vistos por usuários mais similares:',usuarios_similares_X_filmes_nao_vistos.shape)
usuarios_similares_X_filmes_nao_vistos.head(8)

Matriz de filmes não vistos por usuários mais similares: (25, 2328)


movieId,1,2,3,6,7,10,11,13,15,16,...,192245,192379,192385,192803,193537,193599,193687,193753,193960,195715
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
267,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
255,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
343,0.0,0.0,0.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
546,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
109,4.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,4.5,...,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0
55,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
544,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
679,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Gerar a Recomendação de acordo com a nota dada pelos usuários similares

1. Para cada filme: 
2. Para cada usuário da lista de mais similar:
    1. Se nota foi dada: somar nota seguindo a fórmula
$$ notaMédia = {\sum coeficiente * nota \over \sum coeficiente} $$

In [243]:
def predizer_notas(target, filmes, matriz_similaridade = users_cosine, matriz_dados = moviesX_usersY, k=25, min_threshold=0): 
                                            # min_threshold: qtd mínima de notas para ser considerada ao recomendar um filme
    similares = obter_mais_similares(target, matriz_similaridade, matriz_dados, 25)

    resultado = pd.DataFrame(columns=filmes, index=['Nota Final', '# Notas'] ) 
    resultado.columns.name = 'movieId'
    for filme in filmes:
        numerador = 0
        denominador = 0
        qtd_notas = 0
        for similar in similares.index:
            nota = matriz_dados.loc[similar,filme]
            if (nota != 0):
                coeficiente = similares[similar]
                numerador += nota * coeficiente
                denominador += coeficiente
                qtd_notas += 1
        if (qtd_notas < min_threshold):
            resultado = resultado.drop([filme], axis=1) #se a qt de notas for menor que limiar, descartar coluna com informação daquele filme
        else: #se não, prencher a nota calculada da média ponderada e a qtd de notas dadas
            try:
                resultado.at['Nota Final',filme] = round(numerador/denominador,1)
            except:
                #print(f"filme:{filme}, numerador:{numerador}, denominador: {denominador}, qt. notas: {qtd_notas}.")
                resultado.at['Nota Final',filme] = 0
            resultado.at['# Notas',filme] = qtd_notas
    return resultado

In [244]:
qtd_sugestoes = 20 #qtd de sugestões para exibir na tela

filmes_vistos_pelo_usuario = listar_filmes_ja_vistos(target, moviesX_usersY) 
filmes_vistos_pelos_similares = listar_filmes_ja_vistos(usuarios_mais_similares.values.tolist(), moviesX_usersY)
filmes_a_avaliar = list(set(filmes_vistos_pelos_similares)-set(filmes_vistos_pelo_usuario))
recomendacao = predizer_notas(target, filmes_a_avaliar, users_cosine, moviesX_usersY, 25, 5)
#recomendacao
recomendacao.T.sort_values(by=['Nota Final','# Notas',],ascending=False).head(qtd_sugestoes).join(movies[['title']], on=["movieId"])


Unnamed: 0_level_0,Nota Final,# Notas,title
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1265,4.7,7,Groundhog Day (1993)
1200,4.7,5,Aliens (1986)
4262,4.7,5,Scarface (1983)
99114,4.6,6,Django Unchained (2012)
1036,4.6,5,Die Hard (1988)
1090,4.6,5,Platoon (1986)
2959,4.5,19,Fight Club (1999)
318,4.5,16,"Shawshank Redemption, The (1994)"
293,4.5,13,Léon: The Professional (a.k.a. The Professiona...
1193,4.5,13,One Flew Over the Cuckoo's Nest (1975)


## Avaliando a eficácia do método
1. Selecionar um target randômico
2. Selecionar alguns valores de notas dadas por ele
3. Tentar predizer sua nota com base nos seus similares, 
4. Calcular margem de erro 

Uma medida frequentemente usada na verificação da acurácia de modelos numéricos é o Erro Quadrático Médio (MSE na sigla em Inglês) como descrito, por exemplo, em Wilks (2006).MSE é sempre positivo. MSE = 0 indica simulação perfeita. MSE é definido por:
$$ MSE = \frac{1}{n} \Sigma_{i=1}^n({y}-\hat{y})^2 $$

Em adição ao MSE, a raiz quadrada de MSE, ou Raiz do Erro Quadrático Médio (RMSE em Inglês), é comumente usada para expressar a acurácia dos resultados numéricos com a vantagem de que RMSE apresenta valores do erro nas mesmas dimensões da variável analisada. O RMSE é definido por:
$$ RMSE = \sqrt{\frac{1}{n} \Sigma_{i=1}^n({y}-\hat{y})^2} $$

In [245]:
rnd_user = moviesX_usersY.sample()
rnd_user = eliminar_colunas_zeradas(rnd_user)
rnd_user

movieId,293,296,318,541,589,648,778,858,1036,1089,...,109487,111362,111759,112183,112852,115713,116797,116823,128360,164179
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
755,4.5,4.5,3.0,4.0,3.5,3.5,4.5,3.5,5.0,3.5,...,4.0,4.5,4.0,4.5,4.5,5.0,3.5,4.0,4.0,4.0


In [246]:
rnd_user_id = rnd_user.index.values[0]

filmes_assistidos = listar_filmes_ja_vistos(rnd_user_id, moviesX_usersY)
predicao = predizer_notas(rnd_user_id, filmes_assistidos, users_cosine, moviesX_usersY, 25, 0)
predicao


movieId,293,296,318,541,589,648,778,858,1036,1089,...,109487,111362,111759,112183,112852,115713,116797,116823,128360,164179
Nota Final,4.2,4.4,4.1,4.2,4.2,3.8,4.1,4.3,4.1,3.9,...,4.2,4.6,4.5,4.6,4.1,4.3,3.9,4.2,4.0,4.1
# Notas,11.0,18.0,19.0,12.0,12.0,3.0,9.0,15.0,10.0,16.0,...,17.0,4.0,8.0,4.0,15.0,9.0,11.0,3.0,6.0,6.0


In [247]:
def calcular_rmse(real, previsao):
    mse = sklearn.metrics.mean_squared_error(notasReais, notasPreditas)     
    #mse = np.square(np.subtract(notasReais,notasPreditas)).mean()      
    print("Erro Quadrático Médio (MSE):", mse)         
    return math.sqrt(mse)  

notasReais = rnd_user.values.tolist()[0]
notasPreditas = predicao.loc["Nota Final"].values.tolist()
print("Raiz do Erro Quadrático Médio :", calcular_rmse(notasReais, notasPreditas))  

Erro Quadrático Médio (MSE): 1.2695098039215686
Raiz do Erro Quadrático Médio : 1.1267252566271728


---
---

## Item-Based Collaborative Filtering

### Gerar matriz Users X Movies
Transposição da matriz que tinha usuários nas linhas e filmes nas colunas, para filmes nas linhas e usuários nas colunas

In [248]:
usersX_moviesY = moviesX_usersY.T
usersX_moviesY.head(4)

userId,1,2,3,4,5,6,7,8,9,10,...,748,749,750,751,752,753,754,755,756,757
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,3.5,4.0,3.0,4.0,0.0,0.0,4.0,0.0,3.5,...,0.0,5.0,0.0,0.0,5.0,0.0,0.0,0.0,3.0,3.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Matriz de Similaridade Item a Item
Similaridade por Cosseno dos filmes entre si

In [271]:
movies_cosine_array = cosine_similarity(usersX_moviesY)
movies_cosine = pd.DataFrame(data=movies_cosine_array, index=usersX_moviesY.index, columns=usersX_moviesY.index)
movies_cosine.head()
#movies_pearson = movies_users.corr(method='pearson')

movieId,1,2,3,4,5,6,7,8,9,10,...,204692,204698,204704,205054,205072,205106,205413,205499,205557,206272
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.310203,0.307192,0.134028,0.273595,0.300727,0.302314,0.06482,0.186057,0.310636,...,0.0,0.086092,0.05512,0.062994,0.05512,0.0,0.078743,0.078743,0.078743,0.05512
2,0.310203,1.0,0.161157,0.09732,0.183182,0.21563,0.188759,0.142771,0.183523,0.371874,...,0.0,0.035505,0.074329,0.0,0.074329,0.0,0.104061,0.104061,0.104061,0.074329
3,0.307192,0.161157,1.0,0.211848,0.408242,0.274867,0.370458,0.0,0.338879,0.161873,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.134028,0.09732,0.211848,1.0,0.223733,0.146259,0.105589,0.0,0.267947,0.069379,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.273595,0.183182,0.408242,0.223733,1.0,0.244908,0.42007,0.039074,0.186507,0.129423,...,0.0,0.0,0.0,0.0,0.0,0.0,0.151893,0.151893,0.151893,0.0


### Selecionar um usuário e analisar os filmes que ele não deu nota
Os targets serão os filmes que o usuário não deu nota. É analisado os k filmes mais similares ao que ele não viu, e destes, de acordo com as notas que o usuário deu, é calculado a nota estimada. Isto é feito para todos os filmes.
<br><br><br>
É POSSÍVEL MELHORAR O ALGORITMO NÃO APENAS PEGANDO OS K SIMILARES E FAZENDO INTERSESSAO COM AS NOTAS, MAS FAZER A INTERSESSAO PRIMEIRO E DEPOIS SELECIONAR OS K MAIS RELEVANTES??

<center><img src="img/item-based-cosseno-predicao.jpg" style="max-width: 40%"></center>

1. pegar um usuário e os filmes que ele não assistiu
2. pegar um filme que ele não assistiu e selecionar os K mais semelhantes & que o usuário deu nota
3. fazer a média ponderada entre as notas que ele deu pra estes filmes semelhantes para definir a nota nova faltante

In [281]:
def predizer_notas2(usuario, filmes, matriz_similaridade=movies_cosine, matriz_dados = usersX_moviesY, k=100, min_threshold=5):
    usuario = 1
    filmes_não_avaliados = listar_filmes_nao_vistos(usuario,moviesX_usersY) #O que essa essa matriz dados???

    #recomendacao = pd.DataFrame(columns=("movieId", "Nota", "Qt de Notas"))
    resultado = pd.DataFrame(columns=filmes, index=['Nota Final', '# Notas'] ) 
    resultado.columns.name = 'movieId'

    set_avaliados = set(listar_filmes_avaliados(usuario))

    for filme in filmes:  #pegamos um filme da lista e vemos os mais similares
        filmes_mais_similares = obter_mais_similares(filme, matriz_similaridade, matriz_dados,k)
        similares_vistos = list(set(filmes_mais_similares.index) & set_avaliados) #pegar os mais similares que já foram avaliados
        numerador = 0
        denominador = 0
        qtd_notas = 0
        for i in similares_vistos:
            coeficiente = matriz_similaridade[filme][i]
            nota = usersX_moviesY[usuario][i] 
            if (nota != 0):
                numerador += nota * coeficiente
                denominador += coeficiente
                qtd_notas += 1  
        if(qtd_notas < min_threshold):
            resultado = resultado.drop([filme], axis=1) #se a qt de notas for menor que limiar, descartar coluna com informação daquele filme
        else: #se não, prencher a nota calculada da média ponderada e a qtd de notas dadas
            try:
                resultado.at['Nota Final',filme] = round(numerador/denominador,1)
            except:
                #print(f"filme:{filme}, numerador:{numerador}, denominador: {denominador}, qt. notas: {qtd_notas}.")
                resultado.at['Nota Final',filme] = 0
            resultado.at['# Notas',filme] = qtd_notas
            #recomendacao.loc[len(recomendacao)] = [int(filme), round(numerador/denominador,2), qtd_notas]
    return resultado


filmes_não_avaliados = listar_filmes_nao_vistos(1, moviesX_usersY)
predizer_notas2(1, filmes_não_avaliados[:15000],movies_cosine,usersX_moviesY,200).T.join(movies[['title']], on=["movieId"]).sort_values(by=['Nota Final','# Notas','title'],ascending=False).head(20)


Unnamed: 0_level_0,Nota Final,# Notas,title
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6062,4.9,8,Lost in La Mancha (2002)
8364,4.9,8,Baadasssss! (How to Get the Man's Foot Outta Y...
56782,4.9,6,There Will Be Blood (2007)
6953,4.9,6,21 Grams (2003)
3679,4.9,5,"Decline of Western Civilization, The (1981)"
64839,4.8,8,"Wrestler, The (2008)"
92259,4.8,7,Intouchables (2011)
31410,4.8,7,"Downfall (Untergang, Der) (2004)"
44555,4.8,6,"Lives of Others, The (Das leben der Anderen) (..."
5891,4.8,6,I Spit on Your Grave (Day of the Woman) (1978)


In [272]:
rnd_user = moviesX_usersY.sample()
rnd_user = eliminar_colunas_zeradas(rnd_user)
rnd_user

movieId,6,50,253,260,296,356,480,589,593,608,...,122882,122904,134853,148626,152077,152081,156726,161634,164179,168252
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
142,5.0,4.5,4.0,4.0,4.0,3.5,5.0,5.0,5.0,4.5,...,5.0,5.0,4.0,3.5,3.0,3.5,3.5,3.5,4.5,4.0


In [274]:
rnd_user_id = rnd_user.index.values[0]

filmes_assistidos = listar_filmes_ja_vistos(rnd_user_id, moviesX_usersY)
predicao = predizer_notas2(rnd_user_id, filmes_assistidos, movies_cosine, usersX_moviesY, 100, 0)
predicao

movieId,6,50,253,260,296,356,480,589,593,608,...,122882,122904,134853,148626,152077,152081,156726,161634,164179,168252
Nota Final,5.0,4.8,4.0,3.7,4.7,4.2,3.6,3.5,4.6,5.0,...,0,0,4.0,0,0,4.0,0,0,0,0
# Notas,1.0,4.0,2.0,5.0,3.0,4.0,3.0,3.0,2.0,2.0,...,0,0,1.0,0,0,1.0,0,0,0,0


In [275]:
def calcular_rmse(real, previsao):
    mse = sklearn.metrics.mean_squared_error(notasReais, notasPreditas)     
    #mse = np.square(np.subtract(notasReais,notasPreditas)).mean()      
    print("Erro Quadrático Médio (MSE):", mse)         
    return math.sqrt(mse)  

notasReais = rnd_user.values.tolist()[0]
notasPreditas = predicao.loc["Nota Final"].values.tolist()
print("Raiz do Erro Quadrático Médio :", calcular_rmse(notasReais, notasPreditas))  

Erro Quadrático Médio (MSE): 4.707150837988827
Raiz do Erro Quadrático Médio : 2.169596929844073


## Outra abordagem de item based (não faz parte da proposta do trabalho)

In [277]:
usuarioId_target = 1
usuario = moviesX_usersY.loc[usuarioId_target].sort_values(ascending=False)
rate_mais_alto = usuario.iloc[0]
filmes_mais_gosta = usuario[usuario >= rate_mais_alto].index.tolist()
assistiu_n_filmes = len(usuario[usuario > 0].index.to_list())
print(f"O usuário {usuarioId_target} assistiu {assistiu_n_filmes} filmes e deu nota {rate_mais_alto} para estes {len(filmes_mais_gosta)} filmes: ")


O usuário 1 assistiu 70 filmes e deu nota 5.0 para estes 17 filmes: 


### Similaridade dos filmes favoritos com os filmes não assistidos
Matriz onde cada um dos filmes favoritos é um índice e cada coluna é um filme que não foram assistidos ainda.

In [278]:
lista_filmes_ja_vistos = listar_filmes_ja_vistos(usuarioId_target,moviesX_usersY)
print(f"Usuario {usuarioId_target} já assistiu {len(lista_filmes_ja_vistos)} filmes.")
filmes_pro_usuario = movies_cosine.loc[filmes_mais_gosta].drop(columns=lista_filmes_ja_vistos)
filmes_pro_usuario = eliminar_colunas_zeradas(filmes_pro_usuario) #eliminar as colunas dos filmes que não similaridade nenhuma com nada
filmes_pro_usuario

Usuario 1 já assistiu 70 filmes.


movieId,1,2,3,4,5,6,7,8,9,10,...,203649,204352,204542,204692,204698,204704,205054,205072,205106,206272
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7361,0.308922,0.196811,0.067094,0.0,0.067018,0.157191,0.034088,0.0,0.063101,0.177757,...,0.0,0.0,0.070797,0.0,0.126146,0.0,0.101139,0.0,0.0,0.0
4144,0.046782,0.105855,0.0,0.0,0.0,0.079,0.0,0.0,0.0,0.074446,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2692,0.273149,0.182707,0.110478,0.0,0.052364,0.20791,0.068579,0.0,0.073754,0.172477,...,0.107106,0.0,0.0,0.107106,0.042634,0.0,0.080329,0.0,0.0,0.0
3569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6711,0.28734,0.149066,0.07655,0.0,0.031731,0.146839,0.068864,0.059815,0.053051,0.19147,...,0.0,0.0,0.072662,0.0,0.057077,0.0,0.0,0.0,0.0,0.0
4325,0.083187,0.074392,0.0,0.0,0.0,0.070315,0.0,0.0,0.0,0.056569,...,0.0,0.0,0.0,0.0,0.206574,0.0,0.0,0.0,0.0,0.0
6016,0.157819,0.120748,0.054152,0.0,0.03275,0.15481,0.032647,0.0,0.030757,0.115739,...,0.10298,0.0,0.0,0.10298,0.06832,0.0,0.0,0.0,0.0,0.0
5767,0.027094,0.0,0.0,0.0,0.0,0.058016,0.0,0.0,0.0,0.051051,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8327,0.0,0.073582,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
307,0.063397,0.069081,0.0,0.0,0.0,0.114115,0.073929,0.0,0.111178,0.06322,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Transformamos a matriz numa 1xN filmes com o valor máximo encontra de similaridade; e ordenamos essa matriz, selecionando os k filmes com maior similaridade apresentada <br>
Feito também um **join** com movies para mostrar o título

In [279]:
k = 20
#pegar a similaridade máxima que cada um dos filmes não vistos possui com os filmes já vistos
recomendacao = filmes_pro_usuario.max().sort_values(ascending=False).head(k)
recomendacao = pd.DataFrame(recomendacao).join(movies['title'], on='movieId')
recomendacao

Unnamed: 0_level_0,0,title
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
7759,0.744951,Nostalghia (1983)
83803,0.744951,Day & Night (2010)
2931,0.727363,Time of the Gypsies (Dom za vesanje) (1989)
71438,0.707107,Still Walking (Aruitemo aruitemo) (2008)
116899,0.707107,Summer Days With Coo (2007)
96950,0.707107,Guilty (Présumé coupable) (2011)
26565,0.707107,Asterix in Britain (Astérix chez les Bretons) ...
118702,0.707107,Unbroken (2014)
118466,0.707107,"Kingdom of Dreams and Madness, The (2013)"
117438,0.707107,When Marnie Was There (2014)


In [280]:
recomendacao = recomendacao.rename(columns={'title': 'Recomendação', 0: 'Score'})
pq_vc_assistiu = []
nota_media = []
for id in recomendacao.index:
    pq_vc_assistiu.append(filmes_pro_usuario.index[filmes_pro_usuario[id] == recomendacao.loc[id][0]].tolist()[0])
    nota_media.append(ratings.groupby(['movieId'])['rating'].mean().loc[id])
recomendacao['Nota média'] = nota_media
recomendacao['Pq vc assistiu'] = listar_nomes_filmes(pq_vc_assistiu)
recomendacao

Unnamed: 0_level_0,Score,Recomendação,Nota média,Pq vc assistiu
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7759,0.744951,Nostalghia (1983),3.5,"Night, The (Notte, La) (1960)"
83803,0.744951,Day & Night (2010),3.5,"Night, The (Notte, La) (1960)"
2931,0.727363,Time of the Gypsies (Dom za vesanje) (1989),4.375,Underground (1995)
71438,0.707107,Still Walking (Aruitemo aruitemo) (2008),4.5,Dolls (2002)
116899,0.707107,Summer Days With Coo (2007),4.0,Dolls (2002)
96950,0.707107,Guilty (Présumé coupable) (2011),3.5,Dolls (2002)
26565,0.707107,Asterix in Britain (Astérix chez les Bretons) ...,3.0,Dolls (2002)
118702,0.707107,Unbroken (2014),2.5,Dolls (2002)
118466,0.707107,"Kingdom of Dreams and Madness, The (2013)",4.0,Dolls (2002)
117438,0.707107,When Marnie Was There (2014),4.0,Dolls (2002)


---
---

# SVD: Fatoração de Matriz
Devido a esparsidade do dataset, os métodos tradicionais de filtragem colaborativa podem não serem adequados a demanda de processamento. Uma forma de tratar é fazendo uso do algoritmo de **Singular Value Decomposition**, SVD.<br>
Neste algoritmo, a matriz é decomposta em  em outras 3 matrizes de menor dimensionalidade.
$$ A = USV^T$$
- A é a matriz original m x n
- U é uma matriz ortogonal m x n
- S é uma matriz diagona n x n
- V é uma matriz ortogonal n x n

https://heartbeat.comet.ml/recommender-systems-with-python-part-iii-collaborative-filtering-singular-value-decomposition-5b5dcb3f242b

https://www.kaggle.com/code/cast42/simple-svd-movie-recommender

In [None]:
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(moviesX_usersY.to_numpy(), k = 50) #o que são essas k features

print(f"Matriz original{moviesX_usersY.shape} decomposta em U{U.shape}, sigma {sigma.shape} e Vt{Vt.shape}.")

Matriz original(13322, 27321) decomposta em U(13322, 50), sigma (50,) e Vt(50, 27321).


In [None]:
sigma_diag_matrix=np.diag(sigma) #sigma é um array contendo a diagonal
all_user_predicted_ratings = np.dot(np.dot(U, sigma_diag_matrix), Vt)
preds_df = pd.DataFrame(all_user_predicted_ratings, columns = moviesX_usersY.columns, index=moviesX_usersY.index)
preds_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,208531,208545,208683,208715,208737,208787,208793,208795,208939,209163
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.772561,-0.031004,-0.045287,-0.036903,0.009587,-0.207948,0.064104,-0.032135,-0.001484,0.12384,...,-0.000974,0.000111,0.002047,0.00024,-0.001915,-0.001339,-0.002163,0.002527,-0.001393,0.006462
2,4.280314,0.440407,0.044153,-0.013698,-0.120013,0.462136,0.19492,0.09333,-0.101376,0.448111,...,-0.002205,-0.001829,-0.011183,0.002208,-0.003763,-0.002593,0.001678,0.01695,-0.006913,0.006643
3,1.260915,0.471058,-0.260996,-0.13309,-0.017546,0.523084,0.170512,-0.146719,-0.173209,-0.390248,...,0.00239,0.007356,0.016333,-0.002372,0.020703,0.015702,-0.006491,-0.020402,0.011331,-0.002112
4,2.631603,0.220179,-0.178007,-0.029201,-0.084359,0.069683,0.065234,-0.031115,-0.009492,0.328353,...,0.010204,-0.00038,0.00721,0.003881,0.017068,0.00244,-0.006303,0.001462,0.002747,0.007366
5,4.305501,0.937399,1.366239,0.108312,1.150614,1.609428,1.093901,0.065092,0.444644,1.228014,...,-0.002498,-0.000333,-0.001014,-0.001421,-0.006514,0.001098,0.001582,-0.000176,-0.002648,0.002661


In [None]:
def get_high_recommended_movies(userId):
    movies_rated_by_user = moviesX_usersY.loc[userId]
    movies_high_rated_by_user =  movies_rated_by_user[movies_rated_by_user > 4.5].index
    movies_recommended_for_user = preds_df.loc[userId]
    movies_high_recommend_for_user = movies_recommended_for_user[movies_recommended_for_user > 4].index
    return list(set(movies_high_recommend_for_user) - set(movies_high_rated_by_user))

In [None]:
user = 1

rec = get_high_recommended_movies(user)
rec_ = pd.DataFrame(index=list(rec), columns=['Título', 'Nota'])
rec_.index.name='movieId'
rec_['Título'] = listar_nomes_filmes(rec)
for id in rec:
    rec_.at[id,'Nota'] = preds_df.loc[user,id]
rec_



  return movies.loc[indices]['title'].values.tolist()


Unnamed: 0_level_0,Título,Nota
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
