# Building a Movie Recommendation System with Spark MLlib

## Import required libraries

In [1]:
import subprocess
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

spark = SparkSession \
    .builder \
    .appName("bitcoins") \
    .config("spark.som.config.option", "some-value") \
    .getOrCreate()

## Download/Unzip the MovieLens 1M dataset from http://grouplens.org/datasets/movielens

In [2]:
subprocess.call(["wget", "http://files.grouplens.org/datasets/movielens/ml-1m.zip"])
subprocess.call(["unzip", "ml-1m.zip"])

1

## Read and Convert ratings data to a DataFrame

In [3]:
lines = spark.read.text("./ml-1m/ratings.dat").rdd
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]), timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)

## Show the number of ratings in the dataset

In [4]:
print("Number of ratings = " + str(ratings.count()))

Number of ratings = 1000209


## Show a sample of the Ratings DataFrame

In [5]:
ratings.sample(False, 0.0001, seed=0).show(10)

+-------+------+---------+------+
|movieId|rating|timestamp|userId|
+-------+------+---------+------+
|   2908|   5.0|977895809|    68|
|   3730|   5.0|978554445|   173|
|   2917|   2.0|976301830|   456|
|    589|   4.0|976161565|   526|
|   2348|   3.0|976207524|   533|
|   1285|   4.0|979154572|   588|
|   1206|   4.0|980628867|   711|
|   3361|   4.0|975510209|   730|
|   3203|   5.0|975435824|   779|
|   1196|   4.0|975356701|   843|
+-------+------+---------+------+
only showing top 10 rows



## Show sample number of ratings per user

In [6]:
grouped_ratings = ratings.groupBy("userId").count().withColumnRenamed("count", "No. of ratings")
grouped_ratings.show(10)

+------+--------------+
|userId|No. of ratings|
+------+--------------+
|    26|           400|
|    29|           108|
|   474|           318|
|   964|            78|
|  1677|            43|
|  1697|           354|
|  1806|           214|
|  1950|           137|
|  2040|            46|
|  2214|            81|
+------+--------------+
only showing top 10 rows



## Show the number of users in the dataset

In [7]:
print("Number of users = " + str(grouped_ratings.count()))

Number of users = 6040


## Split Ratings data into Training (80%) and Test (20%) datasets

In [8]:
(training, test) = ratings.randomSplit([0.8, 0.2])


## Show resulting Ratings dataset counts

In [9]:
trainingRatio = float(training.count())/float(ratings.count())*100
testRatio = float(test.count())/float(ratings.count())*100

print("Total number of ratings = " + str(ratings.count()))
print("Training dataset count = " + str(training.count()) + ", " + str(trainingRatio) + "%")
print("Test dataset count = " + str(test.count()) + ", " + str(testRatio) + "%")

Total number of ratings = 1000209
Training dataset count = 800880, 80.07126510559293%
Test dataset count = 199329, 19.928734894407068%


## Build the recommendation model on the training data using ALS

In [10]:
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

## Run the model against the Test data and show a sample of the predictions

In [11]:
predictions = model.transform(test).na.drop()
predictions.show(10)

+-------+------+---------+------+----------+
|movieId|rating|timestamp|userId|prediction|
+-------+------+---------+------+----------+
|    148|   1.0|976295338|   840| 2.9349167|
|    148|   2.0|974875106|  1150| 2.9894443|
|    148|   2.0|974178993|  2456| 3.9975448|
|    463|   5.0|968916009|  3151|  3.967182|
|    463|   3.0|963746396|  4858| 2.0730953|
|    463|   4.0|973625620|  2629| 3.1774714|
|    463|   1.0|966523740|  3683| 1.1212827|
|    463|   2.0|966790403|  3562|  2.780132|
|    463|   4.0|975775726|   721| 3.3978982|
|    463|   3.0|965308300|  4252| 0.9944763|
+-------+------+---------+------+----------+
only showing top 10 rows



## Evaluate the model by computing the RMSE on the test data

In [12]:
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.8908929362860674


## Show that a smaller value of rmse is better
This is obviously the case since RMSE is an aggregation of all the error. Thus evaluator.isLargerBetter should be 'false'.

In [13]:
evaluator.isLargerBetter()

False

## Make movie recommendations

In [14]:
# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)

# Show sample recommendations per user

In [15]:
userRecs.sample(False, 0.01).show(10, False)

+------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                                                               |
+------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|148   |[[1780,7.2854385], [1369,6.99533], [666,6.6703053], [2892,6.5549903], [1741,6.528875], [3523,6.07751], [572,6.003775], [2127,5.859668], [1164,5.6353364], [649,5.5918784]]    |
|5173  |[[3245,7.7563887], [1038,7.52281], [3867,7.2047706], [632,7.0838833], [37,7.0073814], [751,6.936385], [1369,6.471981], [645,6.453275], [1664,6.23118], [1543,6.188328]]       |
|5695  |[[1458,9.663776], [3855,9.074218], [3106,9.053921], [2837,9.043263], [21

# Show sample recommendations per user

In [16]:
movieRecs.sample(False, 0.01).show(10, False)

+-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|movieId|recommendations                                                                                                                                                                |
+-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|3844   |[[1213,7.3201046], [2441,6.9640417], [5297,6.8789372], [2549,6.8698826], [2816,6.507644], [1971,6.458085], [2160,6.4162674], [3915,6.402381], [4544,6.17197], [2560,6.119645]] |
|1031   |[[1070,5.9382234], [4143,5.8492775], [3897,5.841146], [2755,5.6947303], [4282,5.6827908], [527,5.6089225], [1728,5.5674863], [5052,5.52997], [5983,5.419548], [1459,5.4131107]]|
|26     |[[1213,7.0531287], [2640,6.3756685], [879,6.1351347], [2502,6