# Pós-Graduação - Ciência de Dados & Big Data

## Pontifícia Universidade Católica de Minas Gerais (PUC-MG)

### Aluno: Victor Hugo Negrisoli

### Sistemas de Recomendação com Spark e MongoDB

In [14]:
from __future__ import print_function
import sys
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
import pymongo
from pymongo import MongoClient
import json
from bson.objectid import ObjectId

In [15]:
# Connection settings for the MongoDB instance used by this notebook.
host = 'localhost'
port = 27017

# Open the client and select the "pos" database; later cells access
# collections through `db`.
client = pymongo.MongoClient(host=host, port=port)
db = client['pos']

In [16]:
def converter_long_para_int_python3():
    """Make the name ``long`` usable on Python 3 by aliasing it to ``int``.

    Python 3 has no ``long`` builtin. When running on Python 3 or newer this
    binds a module-level ``long`` that refers to ``int``; on Python 2 nothing
    is assigned and ``long`` keeps resolving to the builtin.

    Returns:
        None. The only effect is the module-level ``long`` binding.
    """
    # Without this declaration the assignment below would only create a
    # function-local name that disappears when the function returns.
    global long
    # Compare the numeric major version; comparing the version *string*
    # against '3' is lexicographic and would misorder a major version >= 10.
    if sys.version_info[0] >= 3:
        long = int

In [23]:
def remover_collection_se_existir():
    """Drop the ``suggestions`` collection if it holds at least one document.

    When no document is found nothing is dropped and nothing is printed.
    Otherwise the collection is dropped once and a confirmation message is
    printed.
    """
    sugestoes = db.suggestions
    # A single fetched document is enough to know the collection has data.
    if sugestoes.find_one() is None:
        return
    sugestoes.drop()
    print('Collection removida!')

In [24]:
def definir_configuracoes_iniciais():
    """Run the notebook's initial setup steps, in order.

    First the Python 3 ``long`` compatibility helper is called, then the
    helper that removes the existing ``suggestions`` collection.
    """
    etapas = (
        converter_long_para_int_python3,
        remover_collection_se_existir,
    )
    for etapa in etapas:
        etapa()

In [25]:
# Run the one-time setup before any Spark or MongoDB work in later cells.
definir_configuracoes_iniciais()

In [26]:
def criar_sessao_spark():
    """Return a SparkSession for the application named "Pratica ALS".

    Uses ``getOrCreate``, so the builder decides whether a new session is
    created or an existing one is returned.
    """
    construtor = SparkSession.builder.appName("Pratica ALS")
    return construtor.getOrCreate()

In [27]:
# Session handle used by the following cells to read data and build DataFrames.
spark = criar_sessao_spark()

In [37]:
# Ratings file, read as plain text: one record per line, fields separated by "::".
caminho_arquivo = "dados/sample_movielens_ratings.txt"
linhas = spark.read.text(caminho_arquivo).rdd
particoes = linhas.map(lambda linha: linha.value.split("::"))

# Convert the fields at index 0-3 of every record into an integer-typed Row.
# int() is used for the timestamp instead of the Python 2-only long():
# on Python 3 the name `long` does not exist and the lambda would raise
# NameError when Spark evaluates it; int() parses the value correctly on
# both Python 2 and Python 3.
avaliacoesRDD = particoes.map(lambda particao: Row(
    userId = int(particao[0]),
    movieId = int(particao[1]),
    ratingId = int(particao[2]),
    timestampId = int(particao[3])
))


In [40]:
# Build a DataFrame from the RDD of Row objects.
avaliacoes = spark.createDataFrame(avaliacoesRDD)
# Preview the first 10 records as a list of Row objects.
avaliacoes.head(10)

[Row(userId=0, movieId=2, ratingId=3, timestampId=1424380312),
 Row(userId=0, movieId=3, ratingId=1, timestampId=1424380312),
 Row(userId=0, movieId=5, ratingId=2, timestampId=1424380312),
 Row(userId=0, movieId=9, ratingId=4, timestampId=1424380312),
 Row(userId=0, movieId=11, ratingId=1, timestampId=1424380312),
 Row(userId=0, movieId=12, ratingId=2, timestampId=1424380312),
 Row(userId=0, movieId=15, ratingId=1, timestampId=1424380312),
 Row(userId=0, movieId=17, ratingId=1, timestampId=1424380312),
 Row(userId=0, movieId=19, ratingId=1, timestampId=1424380312),
 Row(userId=0, movieId=21, ratingId=1, timestampId=1424380312)]

In [42]:
# Print the first 10 rows of the ratings DataFrame as a formatted table.
avaliacoes.show(10)

+------+-------+--------+-----------+
|userId|movieId|ratingId|timestampId|
+------+-------+--------+-----------+
|     0|      2|       3| 1424380312|
|     0|      3|       1| 1424380312|
|     0|      5|       2| 1424380312|
|     0|      9|       4| 1424380312|
|     0|     11|       1| 1424380312|
|     0|     12|       2| 1424380312|
|     0|     15|       1| 1424380312|
|     0|     17|       1| 1424380312|
|     0|     19|       1| 1424380312|
|     0|     21|       1| 1424380312|
+------+-------+--------+-----------+
only showing top 10 rows



In [45]:
# Split weights: 80% of the ratings for training, 20% for testing.
dataset_treino_oitenta_por_cento = 0.8
dataset_teste_vinte_por_cento = 0.2

lista_treino_teste = [dataset_treino_oitenta_por_cento, dataset_teste_vinte_por_cento]

# Fixed seed so the random split is the same on every "Restart & Run All";
# without it each run trains and evaluates on different rows.
semente_divisao = 42

(treino, teste) = avaliacoes.randomSplit(lista_treino_teste, seed=semente_divisao)

In [46]:
# Print the first 10 rows of the training split.
treino.show(10)

+------+-------+--------+-----------+
|userId|movieId|ratingId|timestampId|
+------+-------+--------+-----------+
|     0|      3|       1| 1424380312|
|     0|      9|       4| 1424380312|
|     0|     11|       1| 1424380312|
|     0|     15|       1| 1424380312|
|     0|     17|       1| 1424380312|
|     0|     19|       1| 1424380312|
|     0|     21|       1| 1424380312|
|     0|     23|       1| 1424380312|
|     0|     27|       1| 1424380312|
|     0|     28|       1| 1424380312|
+------+-------+--------+-----------+
only showing top 10 rows



In [47]:
# Print the first 10 rows of the test split.
teste.show(10)

+------+-------+--------+-----------+
|userId|movieId|ratingId|timestampId|
+------+-------+--------+-----------+
|     0|      2|       3| 1424380312|
|     0|      5|       2| 1424380312|
|     0|     12|       2| 1424380312|
|     0|     26|       3| 1424380312|
|     0|     29|       1| 1424380312|
|     0|     31|       1| 1424380312|
|     0|     37|       1| 1424380312|
|     0|     50|       1| 1424380312|
|     0|     51|       1| 1424380312|
|     0|     59|       2| 1424380312|
+------+-------+--------+-----------+
only showing top 10 rows

