# Collaborative Filtering with ALS using PySpark

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, LongType, StringType
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator 
from pyspark.sql.functions import explode, col

import codecs
import sys

#### Creating SparkSession

In [3]:
spark = SparkSession.builder.appName("ALSMovieRecNotebook").getOrCreate()

In [4]:
ratingsSchema = StructType([
    StructField('userId', IntegerType(), True),
    StructField('movieId', IntegerType(), True),
    StructField('rating', IntegerType(), True),
    StructField('timeStamp', LongType(), True)
])

In [5]:
data = spark.read.option('sep', '::').schema(ratingsSchema).csv('ml-1m/ratings.dat')
data.show(5)

+------+-------+------+---------+
|userId|movieId|rating|timeStamp|
+------+-------+------+---------+
|     1|   1193|     5|978300760|
|     1|    661|     3|978302109|
|     1|    914|     3|978301968|
|     1|   3408|     4|978300275|
|     1|   2355|     5|978824291|
+------+-------+------+---------+
only showing top 5 rows



In [6]:
movieSchema = StructType([
    StructField('movieId', IntegerType(), True),
    StructField('title', StringType(), True),
    StructField('genres', StringType(), True)
])

In [7]:
movies = spark.read.option('sep', '::').schema(movieSchema).csv('ml-1m/movies.dat')
movies.show(5)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Animation|Childre...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|        Comedy|Drama|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



#### Train/Test Split

In [8]:
# Splitting data into training and test set
(train, test) = data.randomSplit([0.7, 0.3], seed = 42)

#### ALS Model

In [9]:
# Setting up ALS Model
USERID = 'userId'
MOVIEID = 'movieId'
RATING = 'rating'

als = ALS(
    maxIter = 5,
    regParam = 0.01,
    nonnegative = True,
    implicitPrefs = False,
    userCol = USERID,
    itemCol = MOVIEID,
    ratingCol = RATING
)

#### Hyperparameter Tuning

In [10]:
# Hyperparameter Tuning
param_grid = ParamGridBuilder().addGrid(als.rank, [10,50,100,150]).addGrid(als.regParam, [0.01, 0.05, 0.1, 0.15]).build()

In [11]:
# Evaluator
eval = RegressionEvaluator(metricName = 'rmse', labelCol = 'rating', predictionCol = 'prediction')

#### Cross Validation

In [12]:
# Cross Validation
cross_val = CrossValidator(estimator = als, estimatorParamMaps = param_grid, evaluator = eval, numFolds = 5)

In [13]:
model = cross_val.fit(train)

In [14]:
best_model = model.bestModel

In [15]:
# view the parameters of the best model
print('Rank:', best_model._java_obj.parent().getRank())
print('Reg Param:', best_model._java_obj.parent().getRegParam())
print('Max Iteration:', best_model._java_obj.parent().getMaxIter())

Rank: 10
Reg Param: 0.01
Max Iteration: 5


#### Predictions on Test Set

In [16]:
# Get predictions for the test data using the best model
predictions = best_model.transform(test)
predictions = predictions.na.drop()

#### Model Evaluation

In [17]:
# Evaluate Performance of model using RMSE
rmse = eval.evaluate(predictions)
print(f"RMSE: {rmse}")

RMSE: 0.8906926686566768


#### Making Recommendations

In [30]:
# Making recommendations for users
num_recommendations = 10
userID = 1
userSchema = StructType(
    [StructField('userId', IntegerType(), True)]
)
users = spark.createDataFrame([[userID,]], userSchema)

recommendations = best_model.recommendForUserSubset(users, num_recommendations)



In [31]:
recommendations.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|     1|[{3880, 10.376456...|
+------+--------------------+



In [32]:
recommendations = recommendations.withColumn('rec_explode', explode('recommendations'))\
    .select('userId', col('rec_explode.movieId'), col('rec_explode.rating'))

recommendations.show()

+------+-------+---------+
|userId|movieId|   rating|
+------+-------+---------+
|     1|   3880|10.376456|
|     1|   2773|10.375473|
|     1|   1696|10.371819|
|     1|    682|10.186635|
|     1|   2209| 9.311568|
|     1|   1930| 9.137518|
|     1|   2963| 8.924449|
|     1|   2197| 8.831035|
|     1|   2998| 8.741822|
|     1|    108| 8.638776|
+------+-------+---------+



In [33]:
movies.show(1)

+-------+----------------+--------------------+
|movieId|           title|              genres|
+-------+----------------+--------------------+
|      1|Toy Story (1995)|Animation|Childre...|
+-------+----------------+--------------------+
only showing top 1 row



In [34]:
recommendations.join(movies, on = 'movieId').show()

+-------+------+---------+--------------------+-----------------+
|movieId|userId|   rating|               title|           genres|
+-------+------+---------+--------------------+-----------------+
|   3880|     1|10.376456|Ballad of Ramblin...|      Documentary|
|   2773|     1|10.375473|Alice and Martin ...|            Drama|
|   1696|     1|10.371819|         Bent (1997)|        Drama|War|
|    682|     1|10.186635|Tigrero: A Film T...|Documentary|Drama|
|   2209|     1| 9.311568|Young and Innocen...|   Crime|Thriller|
|   1930|     1| 9.137518|    Cavalcade (1933)|            Drama|
|   2963|     1| 8.924449| Joe the King (1999)|      Crime|Drama|
|   2197|     1| 8.831035|    Firelight (1997)|            Drama|
|   2998|     1| 8.741822|Dreaming of Josep...|          Romance|
|    108|     1| 8.638776|      Catwalk (1995)|      Documentary|
+-------+------+---------+--------------------+-----------------+



In [35]:
data.join(movies, on = 'movieId').filter('userId = 1').sort('rating', ascending = False).show(10)

+-------+------+------+---------+--------------------+--------------------+
|movieId|userId|rating|timeStamp|               title|              genres|
+-------+------+------+---------+--------------------+--------------------+
|   1193|     1|     5|978300760|One Flew Over the...|               Drama|
|   2355|     1|     5|978824291|Bug's Life, A (1998)|Animation|Childre...|
|   1287|     1|     5|978302039|      Ben-Hur (1959)|Action|Adventure|...|
|   2804|     1|     5|978300719|Christmas Story, ...|        Comedy|Drama|
|    595|     1|     5|978824268|Beauty and the Be...|Animation|Childre...|
|     48|     1|     5|978824351|   Pocahontas (1995)|Animation|Childre...|
|   1035|     1|     5|978301753|Sound of Music, T...|             Musical|
|   3105|     1|     5|978301713|   Awakenings (1990)|               Drama|
|   1270|     1|     5|978300055|Back to the Futur...|       Comedy|Sci-Fi|
|    527|     1|     5|978824195|Schindler's List ...|           Drama|War|
+-------+---