In [1]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [2]:
lines = spark.read.text("../data/mllib/als/sample_movielens_ratings.txt").rdd
parts = lines.map(lambda row: row.value.split("::"))

In [4]:
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]), timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])

In [5]:
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

In [6]:
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.8451581297135


In [7]:
userRecs = model.recommendForAllUsers(10)

In [8]:
movieRecs = model.recommendForAllItems(10)

In [9]:
# Generate top 10 movie recommendations for a specified set of users
users = ratings.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)

In [12]:
users.show()

+------+
|userId|
+------+
|    26|
|    29|
|    19|
+------+



In [14]:
userSubsetRecs.first()

Row(userId=26, recommendations=[Row(movieId=92, rating=6.531030654907227), Row(movieId=25, rating=5.894122123718262), Row(movieId=70, rating=5.175421714782715), Row(movieId=12, rating=5.059426307678223), Row(movieId=51, rating=5.031729698181152), Row(movieId=37, rating=4.9537811279296875), Row(movieId=88, rating=4.953559875488281), Row(movieId=94, rating=4.952250957489014), Row(movieId=7, rating=4.947777271270752), Row(movieId=23, rating=4.944408416748047)])

In [12]:
# Generate top 10 user recommendations for a specified set of movies
movies = ratings.select(als.getItemCol()).distinct().limit(3)
movieSubSetRecs = model.recommendForItemSubset(movies, 10)