# Pós-Graduação - Ciência de Dados & Big Data

## Pontifícia Universidade Católica de Minas Gerais (PUC-MG)

### Aluno: Victor Hugo Negrisoli

### Sistemas de Recomendação com ALS, Spark e MongoDB

In [14]:
from __future__ import print_function
import sys
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
import pymongo
from pymongo import MongoClient
import json
from bson.objectid import ObjectId

### Criando funções de configurações iniciais

In [90]:
host = 'localhost'
port = 27017
client = pymongo.MongoClient(host, port)
db = client.pos

In [91]:
def converter_long_para_int_python3():
    if sys.version >= '3':
        long = int

In [92]:
def remover_collection_se_existir():
    for i in db.suggestions.find():
        db.suggestions.drop()
        print('Collection removida!')
        break

In [93]:
def definir_configuracoes_iniciais():
    converter_long_para_int_python3()
    remover_collection_se_existir()

In [94]:
definir_configuracoes_iniciais()

In [95]:
spark = SparkSession\
            .builder\
            .appName("Pratica ALS")\
            .getOrCreate()

### Define leitura do arquivo e criação do RDD para o DataFrame do Spark

In [96]:
caminho_arquivo = "dados/sample_movielens_ratings.txt"
linhas = spark.read.text(caminho_arquivo).rdd
particoes = linhas.map(lambda linha: linha.value.split("::"))

avaliacoesRDD = particoes.map(lambda particao: Row(
    userId = int(particao[0]),
    movieId = int(particao[1]),
    rating = int(particao[2]),
    timestampId = long(particao[3])
))


In [133]:
avaliacoes = spark.createDataFrame(avaliacoesRDD)

In [98]:
avaliacoes.show(10)

+------+-------+------+-----------+
|userId|movieId|rating|timestampId|
+------+-------+------+-----------+
|     0|      2|     3| 1424380312|
|     0|      3|     1| 1424380312|
|     0|      5|     2| 1424380312|
|     0|      9|     4| 1424380312|
|     0|     11|     1| 1424380312|
|     0|     12|     2| 1424380312|
|     0|     15|     1| 1424380312|
|     0|     17|     1| 1424380312|
|     0|     19|     1| 1424380312|
|     0|     21|     1| 1424380312|
+------+-------+------+-----------+
only showing top 10 rows



In [99]:
type(avaliacoes)

pyspark.sql.dataframe.DataFrame

In [100]:
dataset_treino_oitenta_por_cento = 0.8
dataset_teste_vinte_por_cento = 0.2

lista_treino_teste = [dataset_treino_oitenta_por_cento, dataset_teste_vinte_por_cento]

(treino, teste) = avaliacoes.randomSplit(lista_treino_teste)

In [101]:
treino.show(10)

+------+-------+------+-----------+
|userId|movieId|rating|timestampId|
+------+-------+------+-----------+
|     0|      2|     3| 1424380312|
|     0|      3|     1| 1424380312|
|     0|      9|     4| 1424380312|
|     0|     11|     1| 1424380312|
|     0|     12|     2| 1424380312|
|     0|     15|     1| 1424380312|
|     0|     17|     1| 1424380312|
|     0|     19|     1| 1424380312|
|     0|     21|     1| 1424380312|
|     0|     23|     1| 1424380312|
+------+-------+------+-----------+
only showing top 10 rows



In [102]:
teste.show(10)

+------+-------+------+-----------+
|userId|movieId|rating|timestampId|
+------+-------+------+-----------+
|     0|      5|     2| 1424380312|
|     0|     27|     1| 1424380312|
|     0|     28|     1| 1424380312|
|     0|     41|     2| 1424380312|
|     0|     45|     2| 1424380312|
|     1|      3|     1| 1424380312|
|     1|     13|     1| 1424380312|
|     1|     21|     3| 1424380312|
|     1|     27|     1| 1424380312|
|     1|     36|     2| 1424380312|
+------+-------+------+-----------+
only showing top 10 rows



### Inicia o algoritmo ALS e cria o modelo

In [103]:
als = ALS(
    maxIter = 5,
    regParam = 0.01,
    userCol = "userId",
    itemCol = "movieId",
    ratingCol = "rating",
    coldStartStrategy = "drop"
)

In [104]:
modelo = als.fit(treino)
modelo

ALSModel: uid=ALS_7ee303504406, rank=10

In [105]:
predicoes = modelo.transform(teste)
avaliador = RegressionEvaluator(
    metricName = "rmse",
    labelCol = "rating",
    predictionCol = "prediction"
)
rmse = avaliador.evaluate(predicoes)

In [106]:
print('Análise do Root Mean Square Erro: {}'.format(rmse))

Análise do Root Mean Square Erro: 2.0282237655795714


### Recomendando 10 filmes para todos os usuários e 10 usuários para todos os filmes

In [134]:
recomendacoes_usuario = modelo.recommendForAllUsers(10)

In [108]:
recomendacoes_usuario.show(10, False)

+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                                                        |
+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|28    |[[40, 5.8070765], [92, 4.9770613], [12, 4.936256], [49, 4.0690365], [70, 3.9151845], [2, 3.900339], [64, 3.7940161], [41, 3.672939], [9, 3.522714], [25, 3.3303733]]   |
|26    |[[22, 5.4670906], [88, 4.9659133], [23, 4.9334965], [83, 4.7848415], [94, 4.739018], [85, 4.717673], [75, 4.5893], [18, 4.4105506], [46, 4.3733053], [68, 4.211747]]   |
|27    |[[23, 3.6022978], [83, 3.5768886], [18, 3.549826], [75, 3.3115244], [7, 3.00511], [85, 2.8756883], [62, 2.7

In [135]:
recomendacoes_filmes = modelo.recommendForAllItems(10)

In [110]:
recomendacoes_filmes.show(10, False)

+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|movieId|recommendations                                                                                                                                                        |
+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|31     |[[12, 3.8652148], [16, 3.039193], [8, 2.9057593], [14, 2.890551], [7, 2.8354068], [11, 2.5919433], [22, 2.5733788], [20, 2.2558298], [15, 2.2327132], [25, 1.9736233]] |
|85     |[[22, 5.8348494], [26, 4.717673], [7, 3.6553695], [16, 3.302389], [21, 2.882314], [27, 2.8756883], [6, 2.846146], [1, 2.6691706], [19, 2.5118818], [3, 2.3259466]]     |
|65     |[[23, 4.8562374], [8, 4.502869], [12, 2.9091184], [29, 2.8378484], [19, 2.6900806], [17, 2.3388376], 

In [132]:
usuarios = avaliacoes.select(als.getUserCol()).distinct()

In [114]:
usuarios.show()

+------+
|userId|
+------+
|    26|
|    29|
|    19|
|     0|
|    22|
|     7|
|    25|
|     6|
|     9|
|    27|
|    17|
|    28|
|     5|
|     1|
|    10|
|     3|
|    12|
|     8|
|    11|
|     2|
+------+
only showing top 20 rows



In [113]:
usuarios_recomendacoes_items = recomendacoes_usuario.select(
    recomendacoes_usuario["userId"],
    recomendacoes_usuario["recommendations"]["movieId"])

In [116]:
usuarios_recomendacoes_items.show(10, False)

+------+----------------------------------------+
|userId|recommendations.movieId                 |
+------+----------------------------------------+
|28    |[40, 92, 12, 49, 70, 2, 64, 41, 9, 25]  |
|26    |[22, 88, 23, 83, 94, 85, 75, 18, 46, 68]|
|27    |[23, 83, 18, 75, 7, 85, 62, 51, 27, 38] |
|12    |[96, 27, 69, 64, 71, 90, 35, 1, 47, 16] |
|22    |[73, 85, 7, 18, 74, 51, 30, 75, 23, 88] |
|1     |[55, 62, 68, 83, 49, 18, 13, 46, 52, 9] |
|13    |[39, 93, 8, 18, 83, 63, 34, 66, 92, 53] |
|6     |[25, 74, 43, 30, 80, 63, 85, 51, 2, 7]  |
|16    |[51, 29, 25, 85, 47, 5, 31, 76, 62, 96] |
|3     |[51, 88, 18, 80, 32, 10, 7, 98, 63, 22] |
+------+----------------------------------------+
only showing top 10 rows



### Recomendando 50 filmes para todos os usuários e 50 usuários para todos os filmes

In [117]:
recomendacoes_usuario = modelo.recommendForAllUsers(50)
recomendacoes_filmes = modelo.recommendForAllItems(50)

In [118]:
recomendacoes_usuario.show(50, False)

+------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                            

In [119]:
recomendacoes_filmes.show(50, False)

+-------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|movieId|recommendations                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

### Salvando as recomendações no MongoDB

In [123]:
recomendacoes_10_filmes = modelo.recommendForAllUsers(10)
recomendacoes_10_filmes = recomendacoes_10_filmes.select(
    recomendacoes_10_filmes["userId"],
    recomendacoes_10_filmes["recommendations"]["movieId"].alias("movieId")
)

In [126]:
recomendacoes_collection = recomendacoes_10_filmes.collect()
recomendacoes_collection

[Row(userId=28, movieId=[40, 92, 12, 49, 70, 2, 64, 41, 9, 25]),
 Row(userId=26, movieId=[22, 88, 23, 83, 94, 85, 75, 18, 46, 68]),
 Row(userId=27, movieId=[23, 83, 18, 75, 7, 85, 62, 51, 27, 38]),
 Row(userId=12, movieId=[96, 27, 69, 64, 71, 90, 35, 1, 47, 16]),
 Row(userId=22, movieId=[73, 85, 7, 18, 74, 51, 30, 75, 23, 88]),
 Row(userId=1, movieId=[55, 62, 68, 83, 49, 18, 13, 46, 52, 9]),
 Row(userId=13, movieId=[39, 93, 8, 18, 83, 63, 34, 66, 92, 53]),
 Row(userId=6, movieId=[25, 74, 43, 30, 80, 63, 85, 51, 2, 7]),
 Row(userId=16, movieId=[51, 29, 25, 85, 47, 5, 31, 76, 62, 96]),
 Row(userId=3, movieId=[51, 88, 18, 80, 32, 10, 7, 98, 63, 22]),
 Row(userId=20, movieId=[46, 94, 22, 77, 23, 13, 88, 96, 59, 69]),
 Row(userId=5, movieId=[16, 49, 2, 94, 90, 46, 64, 59, 12, 68]),
 Row(userId=19, movieId=[32, 94, 51, 98, 22, 88, 7, 10, 68, 36]),
 Row(userId=15, movieId=[47, 91, 71, 69, 12, 93, 35, 1, 92, 90]),
 Row(userId=17, movieId=[46, 20, 55, 52, 68, 22, 88, 94, 18, 8]),
 Row(userId=9,

In [129]:
for row in recomendacoes_collection:
    documento = row.asDict()
    print("Salvando o documento: {}".format(documento))
    db.sugestoes.insert_one(documento)

Salvando o documento: {'userId': 28, 'movieId': [40, 92, 12, 49, 70, 2, 64, 41, 9, 25]}
Salvando o documento: {'userId': 26, 'movieId': [22, 88, 23, 83, 94, 85, 75, 18, 46, 68]}
Salvando o documento: {'userId': 27, 'movieId': [23, 83, 18, 75, 7, 85, 62, 51, 27, 38]}
Salvando o documento: {'userId': 12, 'movieId': [96, 27, 69, 64, 71, 90, 35, 1, 47, 16]}
Salvando o documento: {'userId': 22, 'movieId': [73, 85, 7, 18, 74, 51, 30, 75, 23, 88]}
Salvando o documento: {'userId': 1, 'movieId': [55, 62, 68, 83, 49, 18, 13, 46, 52, 9]}
Salvando o documento: {'userId': 13, 'movieId': [39, 93, 8, 18, 83, 63, 34, 66, 92, 53]}
Salvando o documento: {'userId': 6, 'movieId': [25, 74, 43, 30, 80, 63, 85, 51, 2, 7]}
Salvando o documento: {'userId': 16, 'movieId': [51, 29, 25, 85, 47, 5, 31, 76, 62, 96]}
Salvando o documento: {'userId': 3, 'movieId': [51, 88, 18, 80, 32, 10, 7, 98, 63, 22]}
Salvando o documento: {'userId': 20, 'movieId': [46, 94, 22, 77, 23, 13, 88, 96, 59, 69]}
Salvando o documento: {'

### Listando os documentos salvos no MongoDB

In [130]:
recomendacoes = db.sugestoes.find()

In [131]:
for recomendacao in recomendacoes:
    print(recomendacao)

{'_id': ObjectId('5f6632e8077888c3b03dbd2c'), 'userId': 28, 'movieId': [40, 92, 12, 49, 70, 2, 64, 41, 9, 25]}
{'_id': ObjectId('5f6632e8077888c3b03dbd2d'), 'userId': 26, 'movieId': [22, 88, 23, 83, 94, 85, 75, 18, 46, 68]}
{'_id': ObjectId('5f6632e8077888c3b03dbd2e'), 'userId': 27, 'movieId': [23, 83, 18, 75, 7, 85, 62, 51, 27, 38]}
{'_id': ObjectId('5f6632e8077888c3b03dbd2f'), 'userId': 12, 'movieId': [96, 27, 69, 64, 71, 90, 35, 1, 47, 16]}
{'_id': ObjectId('5f6632e8077888c3b03dbd30'), 'userId': 22, 'movieId': [73, 85, 7, 18, 74, 51, 30, 75, 23, 88]}
{'_id': ObjectId('5f6632e8077888c3b03dbd31'), 'userId': 1, 'movieId': [55, 62, 68, 83, 49, 18, 13, 46, 52, 9]}
{'_id': ObjectId('5f6632e8077888c3b03dbd32'), 'userId': 13, 'movieId': [39, 93, 8, 18, 83, 63, 34, 66, 92, 53]}
{'_id': ObjectId('5f6632e8077888c3b03dbd33'), 'userId': 6, 'movieId': [25, 74, 43, 30, 80, 63, 85, 51, 2, 7]}
{'_id': ObjectId('5f6632e8077888c3b03dbd34'), 'userId': 16, 'movieId': [51, 29, 25, 85, 47, 5, 31, 76, 62, 