## Ao longo da execução, documente o código com comentários referentes ao seu entendimento. Utilize também a estrutura de markdown do Jupyter para colocar informações textuais relevantes.

In [1]:
#import findspark
#findspark.init()
#findspark.find()

In [2]:
from __future__ import print_function

import sys
# Python 3 has no separate `long` type; alias it to `int` so the rating
# parser below can keep calling long() on the timestamp field.
# The check uses the structured version tuple instead of comparing the
# free-form `sys.version` string lexicographically.
if sys.version_info[0] >= 3:
    long = int

from pyspark.sql import SparkSession

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [3]:
def g(x):
    """Print ``x`` using the built-in ``print``.

    Thin helper that forwards its single argument to ``print`` and
    implicitly returns ``None``. It is not called in any of the visible
    cells of this notebook.
    """
    print(x)

In [4]:
# Defined here but not referenced by any of the visible cells.
working_directory = 'jars/*'

# Create (or reuse) the SparkSession used by the whole notebook.
# The MongoDB Spark connector is requested through `spark.jars.packages`, and
# the same MongoDB URI is configured for both reads and writes.
# NOTE(review): the connector artifact carries the Scala 2.10 suffix -- confirm
# it matches the Scala version of the Spark build this notebook runs on.
spark = (
    SparkSession.builder
    .appName("ALSExample")
    .config("spark.mongodb.input.uri", "mongodb://localhost:27017/puc.recomendacoes")
    .config("spark.mongodb.output.uri", "mongodb://localhost:27017/puc.recomendacoes")
    .config('spark.jars.packages', "org.mongodb.spark:mongo-spark-connector_2.10:2.2.1")
    .getOrCreate()
)

In [5]:
# Read the ratings file as an RDD of text rows and split every line on "::".
lines = spark.read.text("sample_movielens_ratings.txt").rdd
parts = lines.map(lambda row: row.value.split("::"))
# Convert the four fields of each line into a typed Row:
# userId, movieId, rating, timestamp (in that order within the file).
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]), timestamp=long(p[3])))
# Build the DataFrame directly from the distributed RDD. Calling .collect()
# first would pull every row into the driver's memory only to ship it back
# out to the executors, which does not scale beyond small sample files.
ratings = spark.createDataFrame(ratingsRDD)

In [6]:
# Preview the parsed ratings; show() with no arguments prints the top 20 rows.
ratings.show()

+-------+------+----------+------+
|movieId|rating| timestamp|userId|
+-------+------+----------+------+
|      2|   3.0|1424380312|     0|
|      3|   1.0|1424380312|     0|
|      5|   2.0|1424380312|     0|
|      9|   4.0|1424380312|     0|
|     11|   1.0|1424380312|     0|
|     12|   2.0|1424380312|     0|
|     15|   1.0|1424380312|     0|
|     17|   1.0|1424380312|     0|
|     19|   1.0|1424380312|     0|
|     21|   1.0|1424380312|     0|
|     23|   1.0|1424380312|     0|
|     26|   3.0|1424380312|     0|
|     27|   1.0|1424380312|     0|
|     28|   1.0|1424380312|     0|
|     29|   1.0|1424380312|     0|
|     30|   1.0|1424380312|     0|
|     31|   1.0|1424380312|     0|
|     34|   1.0|1424380312|     0|
|     37|   1.0|1424380312|     0|
|     41|   2.0|1424380312|     0|
+-------+------+----------+------+
only showing top 20 rows



In [7]:
# Split the ratings with 80/20 weights into a training set and a held-out test set.
# A fixed seed keeps the split stable across "Restart & Run All" so that the
# reported error and the recommendations are comparable between runs.
(training, test) = ratings.randomSplit([0.8, 0.2], seed=42)

In [8]:
# Configure the ALS (alternating least squares) recommender on the explicit
# ratings: 5 iterations, regularization 0.01, with the user / item / rating
# columns taken from the `ratings` DataFrame.
# coldStartStrategy="drop" is kept from the original configuration.
# A fixed seed makes the random initialization of the factors reproducible,
# so re-running the notebook yields the same model.
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop", seed=42)
# Train the model on the training split only.
model = als.fit(training)

In [9]:
# Score the held-out test ratings with the fitted model.
predictions = model.transform(test)

# Compare the `rating` column against the `prediction` column using
# root-mean-square error as the metric.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.9371639053055516


In [10]:
# Ask the fitted model for 10 recommendations per user.
userRecs = model.recommendForAllUsers(10)

In [11]:
# Display 10 users with their full (untruncated) recommendation lists.
userRecs.show(n=10, truncate=False)

+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                                                 |
+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+
|28    |[[55,6.844818], [9,6.5482607], [81,5.0254517], [12,4.6324606], [25,4.4356174], [2,4.3153267], [89,4.168298], [10,4.063764], [82,3.8914099], [67,3.2872562]]     |
|26    |[[7,5.067373], [94,4.9493423], [23,4.9435363], [24,4.866168], [22,4.837936], [37,4.68697], [36,4.195289], [68,4.077548], [54,4.052327], [73,3.9752367]]         |
|27    |[[46,7.7903476], [35,4.3797665], [50,4.337532], [18,4.12665], [43,3.4592], [49,3.1166081], [63,3.063826], [27,3.0246928], [83,2.9949412], [51,

In [12]:
# Ask the fitted model for 10 recommendations per movie (item).
movieRecs = model.recommendForAllItems(10)

In [13]:
# Preview the per-movie recommendations; show() with no arguments prints the
# top 20 rows and truncates long cell values.
movieRecs.show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     31|[[12,3.7625587], ...|
|     85|[[24,5.910432], [...|
|     65|[[23,4.916393], [...|
|     53|[[8,4.918167], [2...|
|     78|[[26,1.6316882], ...|
|     34|[[25,3.0684948], ...|
|     81|[[28,5.0254517], ...|
|     28|[[25,5.014336], [...|
|     76|[[14,4.8506417], ...|
|     26|[[17,3.4877255], ...|
|     27|[[23,5.074297], [...|
|     44|[[27,2.798295], [...|
|     12|[[28,4.6324606], ...|
|     91|[[28,3.166791], [...|
|     22|[[26,4.837936], [...|
|     93|[[2,5.1972723], [...|
|     47|[[10,6.103862], [...|
|      1|[[21,1.2190033], ...|
|     52|[[4,4.000535], [2...|
|     13|[[11,4.2402472], ...|
+-------+--------------------+
only showing top 20 rows



In [14]:
# Distinct values of the user column (the column name is read back from the
# ALS estimator's configuration rather than hard-coded).
users = ratings.select(als.getUserCol()).distinct()

In [15]:
# Preview the distinct user ids; show() with no arguments prints the top 20 rows.
users.show()

+------+
|userId|
+------+
|    26|
|    29|
|    19|
|     0|
|    22|
|     7|
|    25|
|     6|
|     9|
|    27|
|    17|
|    28|
|     5|
|     1|
|    10|
|     3|
|    12|
|     8|
|    11|
|     2|
+------+
only showing top 20 rows



In [17]:
#users = ratings.select(als.getUserCol()).distinct().limit(3)
#userSubsetRecs = model.recommendForUserSubset(users, 10)

In [18]:
#userSubsetRecs.select(userSubsetRecs['recommendations']).show()

In [20]:
#movies = ratings.select(als.getItemCol()).distinct().limit(3)
#movieSubSetRecs = model.recommendForItemSubset(movies, 2)

In [21]:
#movieSubSetRecs.show(1, False)

In [22]:
# Keep only the user id and the `movieId` field of each recommendation,
# leaving the predicted ratings out of the projection.
userRecsOnlyItemId = userRecs.select(
    userRecs['userId'],
    userRecs['recommendations']['movieId'],
)

In [23]:
# Display 10 users with their full (untruncated) lists of recommended movie ids.
userRecsOnlyItemId.show(n=10, truncate=False)

+------+----------------------------------------+
|userId|recommendations.movieId                 |
+------+----------------------------------------+
|28    |[55, 9, 81, 12, 25, 2, 89, 10, 82, 67]  |
|26    |[7, 94, 23, 24, 22, 37, 36, 68, 54, 73] |
|27    |[46, 35, 50, 18, 43, 49, 63, 27, 83, 51]|
|12    |[46, 17, 35, 16, 49, 94, 31, 50, 23, 40]|
|22    |[75, 88, 74, 30, 32, 51, 94, 69, 68, 52]|
|1     |[62, 68, 69, 28, 24, 29, 98, 25, 90, 85]|
|13    |[94, 75, 93, 7, 8, 53, 46, 74, 29, 18]  |
|6     |[25, 43, 58, 63, 51, 19, 85, 2, 32, 67] |
|16    |[90, 54, 85, 40, 51, 37, 47, 96, 43, 94]|
|3     |[75, 32, 51, 18, 80, 88, 30, 27, 69, 77]|
+------+----------------------------------------+
only showing top 10 rows



In [25]:
# Project each user's recommendations into two parallel array columns
# (`movieId` and `rating`, the latter cast to array<double>) and write them
# through the MongoDB Spark data source.
# The save mode is "append"; NOTE(review): re-running this cell presumably
# adds the same rows again -- confirm whether duplicates are acceptable.
(
    userRecs
    .select(
        userRecs["userId"],
        userRecs["recommendations"]["movieId"].alias("movieId"),
        userRecs["recommendations"]["rating"].cast('array<double>').alias("rating"),
    )
    .write
    .format("com.mongodb.spark.sql.DefaultSource")
    .mode("append")
    .save()
)

## Agora faça 50 recomendações para todos os usuários

## Recomende 50 usuários para os itens

## Como poderíamos armazenar as recomendações no MongoDB?

## Como podemos fazer isso em Python?

## Podemos utilizar outros datasets de teste, em específico do próprio MovieLens? Pesquise sobre esses datasets e sua estrutura básica.

## Armazene os dados no MongoDB e crie consultas para obter as recomendações para os usuários de ID 6 e 20.