In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, LongType, StringType
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator 
from pyspark.sql.functions import explode, col

import codecs
import sys

In [3]:
spark = SparkSession.builder.appName("ALSMovieRecNotebook").getOrCreate()

In [4]:
ratingsSchema = StructType([
    StructField('userId', IntegerType(), True),
    StructField('movieId', IntegerType(), True),
    StructField('rating', IntegerType(), True),
    StructField('timeStamp', LongType(), True)
])

In [5]:
data = spark.read.option('sep', '::').schema(ratingsSchema).csv('ml-1m/ratings.dat')
data.show(5)

+------+-------+------+---------+
|userId|movieId|rating|timeStamp|
+------+-------+------+---------+
|     1|   1193|     5|978300760|
|     1|    661|     3|978302109|
|     1|    914|     3|978301968|
|     1|   3408|     4|978300275|
|     1|   2355|     5|978824291|
+------+-------+------+---------+
only showing top 5 rows



In [6]:
movieSchema = StructType([
    StructField('movieId', IntegerType(), True),
    StructField('title', StringType(), True),
    StructField('genres', StringType(), True)
])

In [7]:
movies = spark.read.option('sep', '::').schema(movieSchema).csv('ml-1m/movies.dat')
movies.show(5)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Animation|Childre...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|        Comedy|Drama|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



In [8]:
# Splitting data into training and test set
(train, test) = data.randomSplit([0.7, 0.3], seed = 42)

In [9]:
# Setting up ALS Model
USERID = 'userId'
MOVIEID = 'movieId'
RATING = 'rating'

als = ALS(
    maxIter = 5,
    regParam = 0.01,
    nonnegative = True,
    implicitPrefs = False,
    userCol = USERID,
    itemCol = MOVIEID,
    ratingCol = RATING
)

In [10]:
# Hyperparameter Tuning
param_grid = ParamGridBuilder().addGrid(als.rank, [10,50,100,150]).addGrid(als.regParam, [0.01, 0.05, 0.1, 0.15]).build()

In [11]:
# Evaluator
eval = RegressionEvaluator(metricName = 'rmse', labelCol = 'rating', predictionCol = 'prediction')

In [12]:
# Cross Validation
cross_val = CrossValidator(estimator = als, estimatorParamMaps = param_grid, evaluator = eval, numFolds = 5)

In [13]:
model = cross_val.fit(train)

In [14]:
best_model = model.bestModel

In [15]:
# view the parameters of the best model
print('Rank:', best_model._java_obj.parent().getRank())
print('Reg Param:', best_model._java_obj.parent().getRegParam())
print('Max Iteration:', best_model._java_obj.parent().getMaxIter())

Rank: 10
Reg Param: 0.01
Max Iteration: 5


In [16]:
# Get predictions for the test data using the best model
predictions = best_model.transform(test)
predictions = predictions.na.drop()

In [17]:
# Evaluate Performance of model using RMSE
rmse = eval.evaluate(predictions)
print(f"RMSE: {rmse}")

RMSE: 0.8906926686566768


In [24]:
# Making recommendations for users
num_recommendations = 10
userID = 21
userSchema = StructType(
    [StructField('userId', IntegerType(), True)]
)
users = spark.createDataFrame([[userID,]], userSchema)

recommendations = best_model.recommendForUserSubset(users, num_recommendations)



In [25]:
recommendations.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    21|[{1575, 11.443403...|
+------+--------------------+



In [26]:
recommendations = recommendations.withColumn('rec_explode', explode('recommendations'))\
    .select('userId', col('rec_explode.movieId'), col('rec_explode.rating'))

recommendations.show()

+------+-------+---------+
|userId|movieId|   rating|
+------+-------+---------+
|    21|   1575|11.443403|
|    21|   2627|11.159713|
|    21|   2913|10.363252|
|    21|   3050|9.4365425|
|    21|    682| 9.299013|
|    21|    309| 9.141784|
|    21|   3816| 9.130868|
|    21|   1773| 9.068056|
|    21|   2998| 8.974762|
|    21|    718| 8.607745|
+------+-------+---------+



In [27]:
movies.show(1)

+-------+----------------+--------------------+
|movieId|           title|              genres|
+-------+----------------+--------------------+
|      1|Toy Story (1995)|Animation|Childre...|
+-------+----------------+--------------------+
only showing top 1 row



In [28]:
recommendations.join(movies, on = 'movieId').show()

+-------+------+---------+--------------------+-----------------+
|movieId|userId|   rating|               title|           genres|
+-------+------+---------+--------------------+-----------------+
|   1575|    21|11.443403|       Gabbeh (1996)|            Drama|
|   2627|    21|11.159713|    Endurance (1998)|Documentary|Drama|
|   2913|    21|10.363252|Mating Habits of ...|           Comedy|
|   3050|    21|9.4365425|  Light It Up (1999)|            Drama|
|    682|    21| 9.299013|Tigrero: A Film T...|Documentary|Drama|
|    309|    21| 9.141784|Red Firecracker, ...|            Drama|
|   3816|    21| 9.130868|Official Story, T...|            Drama|
|   1773|    21| 9.068056|   Tokyo Fist (1995)|     Action|Drama|
|   2998|    21| 8.974762|Dreaming of Josep...|          Romance|
|    718|    21| 8.607745|Visitors, The (Le...|    Comedy|Sci-Fi|
+-------+------+---------+--------------------+-----------------+



In [29]:
data.join(movies, on = 'movieId').filter('userId = 1').sort('rating', ascending = False).show(5)

+-------+------+------+---------+--------------------+--------------------+
|movieId|userId|rating|timeStamp|               title|              genres|
+-------+------+------+---------+--------------------+--------------------+
|   1035|     1|     5|978301753|Sound of Music, T...|             Musical|
|   1836|     1|     5|978300172|Last Days of Disc...|               Drama|
|   3105|     1|     5|978301713|   Awakenings (1990)|               Drama|
|   2355|     1|     5|978824291|Bug's Life, A (1998)|Animation|Childre...|
|   1270|     1|     5|978300055|Back to the Futur...|       Comedy|Sci-Fi|
+-------+------+------+---------+--------------------+--------------------+
only showing top 5 rows

