# ALSによる協調フィルタリング

https://spark.apache.org/docs/latest/ml-collaborative-filtering.html

In [1]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [2]:
lines = spark.read.text("/data/mllib/als/sample_movielens_ratings.txt").rdd
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]), timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])

In [3]:
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

In [4]:
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.6732488616710943


In [5]:
test.select("userId", "movieId", "rating").where("userId=19").orderBy("movieId").show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|    19|      0|   1.0|
|    19|      2|   1.0|
|    19|     14|   1.0|
|    19|     31|   1.0|
|    19|     41|   1.0|
|    19|     43|   1.0|
|    19|     45|   1.0|
|    19|     53|   2.0|
|    19|     58|   1.0|
|    19|     64|   1.0|
|    19|     90|   4.0|
+------+-------+------+



In [6]:
predictions.select("userId", "movieId", "rating").where("userId=19").orderBy("movieId").show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|    19|      0|   1.0|
|    19|      2|   1.0|
|    19|     14|   1.0|
|    19|     31|   1.0|
|    19|     41|   1.0|
|    19|     43|   1.0|
|    19|     45|   1.0|
|    19|     53|   2.0|
|    19|     58|   1.0|
|    19|     64|   1.0|
|    19|     90|   4.0|
+------+-------+------+

