In [1]:
import os

import findspark

# Locate the Spark installation. Prefer the SPARK_HOME environment variable so
# the notebook is portable across machines; fall back to the original
# machine-specific path so existing setups keep working unchanged.
# findspark.init() has to run before the pyspark import below.
findspark.init(os.environ.get('SPARK_HOME', '/home/mint/spark-2.1.0-bin-hadoop2.7'))

from pyspark.sql import SparkSession

# Start (or reuse) the Spark session named 'rec' for this notebook.
spark = SparkSession.builder.appName('rec').getOrCreate()

# Load the ratings CSV: the first row is treated as the header and column
# types are inferred from the data.
data = spark.read.csv('movielens_ratings.csv', inferSchema=True, header=True)
data.show()

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      2|   3.0|     0|
|      3|   1.0|     0|
|      5|   2.0|     0|
|      9|   4.0|     0|
|     11|   1.0|     0|
|     12|   2.0|     0|
|     15|   1.0|     0|
|     17|   1.0|     0|
|     19|   1.0|     0|
|     21|   1.0|     0|
|     23|   1.0|     0|
|     26|   3.0|     0|
|     27|   1.0|     0|
|     28|   1.0|     0|
|     29|   1.0|     0|
|     30|   1.0|     0|
|     31|   1.0|     0|
|     34|   1.0|     0|
|     37|   1.0|     0|
|     41|   2.0|     0|
+-------+------+------+
only showing top 20 rows



In [2]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

# One fixed seed for both the train/test split and the ALS factor
# initialisation, so re-running the notebook gives repeatable results.
SEED = 42

# 70/30 random split of the ratings into training and held-out test rows.
train_data, test_data = data.randomSplit([0.7, 0.3], seed=SEED)

# Fit an explicit-feedback ALS matrix-factorisation model on the training rows.
model = ALS(
    maxIter=5,
    regParam=0.01,
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    seed=SEED,
).fit(train_data)

# ALS emits NaN for any user or movie that did not appear in train_data
# (cold start); a single NaN would turn the RMSE computed downstream into NaN.
# Drop those rows explicitly rather than relying on a cold-start option, which
# the Spark version referenced by this notebook may not provide.
predictions = model.transform(test_data).na.drop(subset=['prediction'])
predictions.show()

+-------+------+------+------------+
|movieId|rating|userId|  prediction|
+-------+------+------+------------+
|     85|   1.0|    12|   -0.506113|
|     85|   3.0|     6|   2.4651365|
|     85|   1.0|     4| -0.39240158|
|     85|   1.0|    23|   2.5723114|
|     85|   4.0|     7|  -2.5198703|
|     85|   1.0|    25|   2.1131983|
|     85|   3.0|    21|   2.7996652|
|     65|   1.0|    22|   2.1855187|
|     65|   1.0|    16|    2.048207|
|     65|   5.0|    23|   3.8772216|
|     65|   1.0|    24| -0.30137157|
|     65|   1.0|     2| -0.40656617|
|     53|   1.0|    12|   0.6356875|
|     53|   5.0|     8|   6.2539587|
|     53|   1.0|    25|   1.7201474|
|     53|   3.0|    14|-0.064964175|
|     78|   1.0|     1|   1.0734012|
|     78|   1.0|    20|   1.2275665|
|     78|   1.0|    24| -0.22141272|
|     34|   1.0|    16|   3.4690292|
+-------+------+------+------------+
only showing top 20 rows



In [4]:
# Report the root-mean-square error between the true 'rating' column and the
# model's 'prediction' column on the held-out predictions.
print('Testing RMSE:')
rmse_evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='rating',
    predictionCol='prediction',
)
test_rmse = rmse_evaluator.evaluate(predictions)
print(test_rmse)

Testing RMSE:
2.2015891853360703


In [9]:
# Keep only the (movieId, userId) pairs from the test split that belong to
# user 11; the rating column is dropped so only the model's score is shown.
single_user = test_data.where(test_data.userId == 11).select('movieId', 'userId')

# Score every candidate pair and list them from highest to lowest prediction.
(
    model.transform(single_user)
    .sort('prediction', ascending=False)
    .show()
)

+-------+------+-----------+
|movieId|userId| prediction|
+-------+------+-----------+
|     80|    11|  4.0428543|
|     88|    11|   3.930434|
|     18|    11|  3.7125754|
|     77|    11|   3.547746|
|     38|    11|  2.5194998|
|     32|    11|  2.4223847|
|      9|    11|   2.327965|
|     41|    11|  1.8651865|
|     81|    11|  1.7083263|
|     21|    11|  1.6200585|
|     97|    11|  1.4544152|
|     79|    11|  0.9570358|
|     86|    11| 0.21667144|
|     62|    11|-0.29146147|
|      0|    11|-0.31706208|
|     82|    11| -0.9322159|
|     64|    11| -1.0584009|
|     48|    11| -3.2417574|
+-------+------+-----------+

