In [1]:
%scala
import org.apache.spark.ml.recommendation.ALS
// Load the "::"-delimited ratings file and cast each field to a typed column.
// Assumes each line is userId::movieId::rating::timestamp — TODO confirm against the file.
val ratings = spark.read.textFile("/FileStore/tables/0y03ep1o1497652390466/sample.txt")
  .selectExpr("split(value , '::') as col")
  .selectExpr(
    "cast(col[0] as int) as userId",
    "cast(col[1] as int) as movieId",
    // Fix: rating and timestamp were previously both read from col[1] (the movie id).
    "cast(col[2] as float) as rating",
    "cast(col[3] as long) as timestamp")

// Random 80/20 split into training and test sets (no seed, so it differs per run).
val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2))

// ALS estimator wired to the column names produced above.
val als = new ALS()
  .setMaxIter(5)
  .setRegParam(0.01)
  .setUserCol("userId")
  .setItemCol("movieId")
  .setRatingCol("rating")

// Fit on the training split and score the held-out split; the model adds a "prediction" column.
// NOTE(review): users/items absent from training may get NaN predictions unless a
// cold-start strategy is configured on the estimator — confirm for the Spark version in use.
val alsModel = als.fit(training)
val predictions = alsModel.transform(test)

In [2]:
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
# Load the "::"-delimited ratings file and cast each field to a typed column.
# Assumes each line is userId::movieId::rating::timestamp — TODO confirm against the file.
# spark.read.text already returns a DataFrame with a single "value" column, so the
# former .rdd.toDF() round-trip was unnecessary and has been dropped.
ratings = spark.read.text("/FileStore/tables/0y03ep1o1497652390466/sample.txt")\
  .selectExpr("split(value , '::') as col")\
  .selectExpr(
    "cast(col[0] as int) as userId",
    "cast(col[1] as int) as movieId",
    # Fix: rating and timestamp were previously both read from col[1] (the movie id).
    "cast(col[2] as float) as rating",
    "cast(col[3] as long) as timestamp")

# Random 80/20 split into training and test sets (no seed, so it differs per run).
training, test = ratings.randomSplit([0.8, 0.2])

# ALS estimator wired to the column names produced above.
als = ALS()\
  .setMaxIter(5)\
  .setRegParam(0.01)\
  .setUserCol("userId")\
  .setItemCol("movieId")\
  .setRatingCol("rating")

# Fit on the training split and score the held-out split; the model adds a "prediction" column.
# NOTE(review): users/items absent from training may get NaN predictions unless a
# cold-start strategy is configured on the estimator — confirm for the Spark version in use.
alsModel = als.fit(training)
predictions = alsModel.transform(test)

In [3]:
%scala
import org.apache.spark.ml.evaluation.RegressionEvaluator
// Evaluator that scores the "prediction" column against the "rating" label using RMSE.
val evaluator = new RegressionEvaluator()
  .setPredictionCol("prediction")
  .setLabelCol("rating")
  .setMetricName("rmse")

// Compute the metric over the scored test set and report it.
val rmse = evaluator.evaluate(predictions)
println(s"Root-mean-square error = $rmse")

In [4]:
from pyspark.ml.evaluation import RegressionEvaluator
# Evaluator that scores the "prediction" column against the "rating" label using RMSE.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)

# Compute the metric over the scored test set.
rmse = evaluator.evaluate(predictions)

In [5]:
%scala
import org.apache.spark.mllib.evaluation.{
RankingMetrics,
RegressionMetrics}
// Build the (prediction, observation) pairs that RegressionMetrics consumes.
// Fix: the pairs were previously emitted as (rating, prediction); the order matters for
// asymmetric metrics such as r2 and explained variance.
val regComparison = predictions.select("prediction", "rating")
  .rdd
  // Both columns are floats here; widen to Double as the metrics class requires.
  .map(row => (row.getFloat(0).toDouble, row.getFloat(1).toDouble))
val metrics = new RegressionMetrics(regComparison)

In [6]:
from pyspark.mllib.evaluation import RegressionMetrics
# Build the (prediction, observation) pairs that RegressionMetrics consumes.
# Fixes:
#   * Row fields are read by indexing (x[0]); the previous x(0) *called* the Row
#     instead of reading a field from it.
#   * Pairs are now emitted as (prediction, rating) rather than (rating, prediction);
#     the order matters for asymmetric metrics such as r2 and explained variance.
regComparison = predictions.select("prediction", "rating")\
  .rdd\
  .map(lambda x: (float(x[0]), float(x[1])))
metrics = RegressionMetrics(regComparison)

In [7]:
%scala
import org.apache.spark.mllib.evaluation.{RankingMetrics, RegressionMetrics}
import org.apache.spark.sql.functions.{col, expr}
// Per user, the distinct set of movies in `predictions` whose actual rating exceeds 2.5.
val perUserActual = {
  val ratedAboveThreshold = predictions.filter("rating > 2.5")
  ratedAboveThreshold
    .groupBy("userId")
    .agg(expr("collect_set(movieId) as movies"))
}


In [8]:
from pyspark.mllib.evaluation import RankingMetrics, RegressionMetrics
from pyspark.sql.functions import col, expr
# Per user, the distinct set of movies in `predictions` whose actual rating exceeds 2.5.
perUserActual = (
    predictions
    .filter("rating > 2.5")
    .groupBy("userId")
    .agg(expr("collect_set(movieId) as movies"))
)

In [9]:
%scala
// Per user, the list of movie ids ranked by predicted rating, highest first.
// Fix: a global orderBy before groupBy/collect_list does not guarantee the order of the
// collected list after the shuffle. Instead, collect (prediction, movieId) structs, sort
// them descending inside the aggregate, and project out movieId.
// Ties on prediction are broken by movieId, descending.
val perUserPredictions = predictions
  .groupBy("userId")
  .agg(expr(
    "sort_array(collect_list(struct(prediction, movieId)), false).movieId as movies"))

In [10]:
# Per user, the list of movie ids ranked by predicted rating, highest first.
# Fix: a global orderBy before groupBy/collect_list does not guarantee the order of the
# collected list after the shuffle. Instead, collect (prediction, movieId) structs, sort
# them descending inside the aggregate, and project out movieId.
# Ties on prediction are broken by movieId, descending.
# NOTE(review): the previous expr("prediction DESC") may be parsed as the column
# `prediction` aliased to "DESC" (i.e. an ascending sort) — confirm on your Spark version.
perUserPredictions = predictions\
  .groupBy("userId")\
  .agg(expr(
    "sort_array(collect_list(struct(prediction, movieId)), false).movieId as movies"))

In [11]:
%scala
// Pair each user's predicted ranking with their ground-truth movie set.
// After joining on userId the row layout is: (userId, actual movies, predicted movies).
// Fix: RankingMetrics expects (predicted ranking, ground-truth set) pairs; the tuple was
// previously built the other way round, as (actual, predicted).
val perUserActualvPred = perUserActual.join(perUserPredictions, Seq("userId"))
  .map(row => (
    // Predicted movies for this user, truncated to the first 15 entries.
    row.getSeq[Integer](2).toArray.take(15),
    // Movies collected for this user in perUserActual.
    row.getSeq[Integer](1).toArray
  ))
val ranks = new RankingMetrics(perUserActualvPred.rdd)

In [12]:
# Pair each user's predicted ranking with their ground-truth movie set.
# After joining on userId the row layout is: (userId, actual movies, predicted movies).
# Fix: RankingMetrics expects (predicted ranking, ground-truth set) pairs; the tuple was
# previously built the other way round, as (actual, predicted).
perUserActualvPred = perUserActual.join(perUserPredictions, ["userId"]).rdd\
  .map(lambda row: (row[2][:15], row[1]))
ranks = RankingMetrics(perUserActualvPred)

In [13]:
%scala
// Mean average precision over all users in `ranks`.
ranks.meanAveragePrecision
// Precision computed over the first 5 entries of each predicted ranking.
ranks.precisionAt(5)

In [14]:
# Mean average precision over all users in `ranks`.
ranks.meanAveragePrecision
# Precision computed over the first 2 entries of each predicted ranking.
# NOTE(review): the Scala cell uses k=5 here — confirm whether k=2 is intentional.
ranks.precisionAt(2)