In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

spark = SparkSession.builder.appName('recommender').getOrCreate()
df = spark.read.csv('movielens_ratings.csv', inferSchema= True, header = True)
df.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- userId: integer (nullable = true)



In [3]:
df.show(3)

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      2|   3.0|     0|
|      3|   1.0|     0|
|      5|   2.0|     0|
+-------+------+------+
only showing top 3 rows



In [4]:
df.describe().show()

+-------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|
+-------+------------------+------------------+------------------+
|  count|              1501|              1501|              1501|
|   mean| 49.40572951365756|1.7741505662891406|14.383744170552964|
| stddev|28.937034065088994| 1.187276166124803| 8.591040424293272|
|    min|                 0|               1.0|                 0|
|    max|                99|               5.0|                29|
+-------+------------------+------------------+------------------+



In [6]:
train, test = df.randomSplit([0.8, 0.2])

In [7]:
als = ALS(maxIter=5, regParam=0.01, userCol='userId', itemCol='movieId', ratingCol='rating')

In [8]:
model = als.fit(train)
predictions = model.transform(test)
predictions.show()

+-------+------+------+-----------+
|movieId|rating|userId| prediction|
+-------+------+------+-----------+
|     31|   1.0|    26| -2.5238004|
|     31|   1.0|    27|-0.59501255|
|     31|   1.0|     4|   3.137197|
|     85|   1.0|    28| -0.1683234|
|     85|   1.0|    13|  2.2037606|
|     85|   5.0|     8|   4.343044|
|     85|   1.0|    29|  1.5260103|
|     65|   1.0|    28|  3.4493313|
|     53|   3.0|    13|   2.631197|
|     53|   1.0|    25| -2.3101962|
|     78|   1.0|    13| 0.54879403|
|     78|   1.0|    11|  0.4418241|
|     81|   5.0|    28|  0.8307642|
|     81|   1.0|     1| -1.0092545|
|     81|   1.0|     6|  2.4090357|
|     81|   1.0|    19| 0.13363218|
|     81|   1.0|    15|  0.5015665|
|     28|   1.0|    23| -0.2624761|
|     28|   1.0|     2|  1.4344041|
|     76|   1.0|     1|  1.9119977|
+-------+------+------+-----------+
only showing top 20 rows



In [9]:
evaluator = RegressionEvaluator(metricName = 'rmse', labelCol = 'rating', predictionCol = 'prediction')
rmse = evaluator.evaluate(predictions)
print('RMSE:', rmse)

RMSE: 1.8124486699552562


In [14]:
this_user = test.filter(test['userId'] == 12).select('userId', 'movieId')
this_user.show()

+------+-------+
|userId|movieId|
+------+-------+
|    12|      4|
|    12|     18|
|    12|     22|
|    12|     35|
|    12|     38|
|    12|     41|
|    12|     45|
|    12|     63|
|    12|     79|
|    12|     83|
|    12|     95|
|    12|     96|
+------+-------+



In [15]:
recommendation_this_user = model.transform(this_user)
recommendation_this_user.show()

+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|    12|     22| 1.6517887|
|    12|     96| 0.1308065|
|    12|     41| 1.4067035|
|    12|     35| 0.7640405|
|    12|      4|-1.1053085|
|    12|     63|  3.851338|
|    12|     45|0.70455414|
|    12|     38| 2.8361285|
|    12|     95| 0.9426958|
|    12|     83| 0.6145076|
|    12|     79| 1.3491223|
|    12|     18| -0.656619|
+------+-------+----------+



In [17]:
recommendation_this_user.orderBy('prediction', ascending=False).show()

+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|    12|     63|  3.851338|
|    12|     38| 2.8361285|
|    12|     22| 1.6517887|
|    12|     41| 1.4067035|
|    12|     79| 1.3491223|
|    12|     95| 0.9426958|
|    12|     35| 0.7640405|
|    12|     45|0.70455414|
|    12|     83| 0.6145076|
|    12|     96| 0.1308065|
|    12|     18| -0.656619|
|    12|      4|-1.1053085|
+------+-------+----------+

