In [1]:
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder \
	.master('local[*]') \
    .appName("Load and Query CSV with SQL") \
    .getOrCreate()

In [2]:
ratings = spark.read.csv("dataset/ratings.csv",sep=',', header=True, inferSchema=True, nullValue='NA') 
movies = spark.read.csv("dataset/movies.csv",sep=',', header=True, inferSchema=True, nullValue='NA') 
tags = spark.read.csv("dataset/tags.csv",sep=',', header=True, inferSchema=True, nullValue='NA') 


# Viewing the MovieLens Data

Familiarize yourself with the ratings dataset provided here. Would you consider the data to be implicit or explicit ratings?

In [3]:
# Look at the column names
print(ratings.columns)

# Look at the first few rows of data
print(ratings.show(3))

['userId', 'movieId', 'rating', 'timestamp']
+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
+------+-------+------+---------+
only showing top 3 rows

None


# Calculate sparsity

As you know, ALS works well with sparse datasets. Let's see how much of the ratings matrix is actually empty.

Remember that sparsity is calculated by the number of cells in a matrix that contain a rating divided by the total number of values that matrix could hold given the number of users and items (movies). In other words, dividing the number of ratings present in the matrix by the product of users and movies in the matrix and subtracting that from 1 will give us the sparsity or the percentage of the ratings matrix that is empty.

In [4]:
# Count the total number of ratings in the dataset
numerator = ratings.select("rating").count()

# Count the number of distinct userIds and distinct movieIds
num_users = ratings.select("userId").distinct().count()
num_movies = ratings.select("movieId").distinct().count()

# Set the denominator equal to the number of users multiplied by the number of movies
denominator = num_users * num_movies

# Divide the numerator by the denominator
sparsity = (1.0 - (numerator *1.0)/denominator)*100
print("The ratings dataframe is ", "%.2f" % sparsity + "% empty.")

The ratings dataframe is  98.30% empty.


# The GroupBy and Filter methods

Now that we know a little more about the dataset, let's look at some general summary metrics of the ratings dataset and see how many ratings the movies have and how many ratings each users has provided.

In [5]:
# Import the requisite packages
from pyspark.sql.functions import col

# View the ratings dataset
ratings.show(3)

# Filter to show only userIds less than 100
ratings.filter(col("userId") < 100).show(3)

# Group data by userId, count ratings
ratings.groupBy("userId").count().show(3)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
+------+-------+------+---------+
only showing top 3 rows

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
+------+-------+------+---------+
only showing top 3 rows

+------+-----+
|userId|count|
+------+-----+
|   148|   48|
|   463|   33|
|   471|   28|
+------+-----+
only showing top 3 rows



# MovieLens Summary Statistics

Let's take the groupBy() method a bit further.

Once you've applied the .groupBy() method to a dataframe, you can subsequently run aggregate functions such as .sum(), .avg(), .min() and have the results grouped.

In [6]:
from pyspark.sql.functions import min, avg
# Min num ratings for movies
print("Movie with the fewest ratings: ")
ratings.groupBy("movieId").count().select(min("count")).show()

# Avg num ratings per movie
print("Avg num ratings per movie: ")
ratings.groupBy("movieId").count().select(avg("count")).show()

# Min num ratings for user
print("User with the fewest ratings: ")
ratings.groupBy("userId").count().select(min("count")).show()

# Avg num ratings per users
print("Avg num ratings per user: ")
ratings.groupBy("userId").count().select(avg("count")).show()

Movie with the fewest ratings: 
+----------+
|min(count)|
+----------+
|         1|
+----------+

Avg num ratings per movie: 
+------------------+
|        avg(count)|
+------------------+
|10.369806663924312|
+------------------+

User with the fewest ratings: 
+----------+
|min(count)|
+----------+
|        20|
+----------+

Avg num ratings per user: 
+------------------+
|        avg(count)|
+------------------+
|165.30491803278687|
+------------------+



# View Schema

As you know from previous chapters, Spark's implementation of ALS requires that movieIds and userIds be provided as integer datatypes. Many datasets need to be prepared accordingly in order for them to function properly with Spark. A common issue is that Spark thinks numbers are strings, and vice versa.

In [7]:
# Use .printSchema() to see the datatypes of the ratings dataset
ratings.printSchema()



root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [8]:
# Tell Spark to convert the columns to the proper data types
ratings = ratings.select(ratings.userId.cast("integer"), ratings.movieId.cast("integer"), ratings.rating.cast("double"))

# Call .printSchema() again to confirm the columns are now in the correct format
ratings.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)



# Create test/train splits and build your ALS model

You already know how to build an ALS model, having done it in the previous chapter. We will do that again, but we'll take some additional steps to fully build out a cross-validated model.

In [9]:
# Import the required functions
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create test and train set
(train, test) = ratings.randomSplit([0.8, 0.2], seed = 1234)

# Create ALS model
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", nonnegative = True, implicitPrefs = False)

# Confirm that a model called "als" was created
type(als)

pyspark.ml.recommendation.ALS

# Tell Spark how to tune your ALS model

Now we'll need to create a ParamGrid to tell Spark what hyperparameters we want it to tune, how to tune them, and then build out an evaluator so Spark can know how to measure the algorithm's performance.

In [10]:
# Import the requisite items
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Add hyperparameters and their respective values to param_grid
param_grid = ParamGridBuilder() \
            .addGrid(als.rank, [10]) \
            .addGrid(als.maxIter, [5] ) \
            .addGrid(als.regParam, [.05]  ) \
            .build()
           
# Define evaluator as RMSE and print length of evaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction") 
print ("Num models to be tested: ", len(param_grid))

Num models to be tested:  1


# Build your cross validation pipeline

Now that we have our data, our train/test splits, our model, and our hyperparameter values, let's tell Spark how to cross validate our model so it can find the best combination of hyperparameters and return it to us.

In [11]:
# Build cross validation using CrossValidator
cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5)

# Confirm cv was built
print(cv)

CrossValidator_f15967e4e861


# Best Model and Best Model Parameters

Now that we have our cross validator, cv, built out, we can tell Spark to take our data, fit the ALS algorithm to it, and try the different combinations of hyperparameter values from our param_grid so that it can identify what values provide the smallest RMSE. 

In [12]:
model = cv.fit(train)
prediction = model.transform(test)
best_model = model.bestModel
# Print best_model
print(model.bestModel)

ALSModel: uid=ALS_10e5d64aca8b, rank=10


In [24]:
# Complete the code below to extract the ALS model parameters
print("**Best Model**")

# Print "Rank"
print("  Rank:", best_model.rank)

# Print "MaxIter"
# print("  MaxIter:", best_model.getMaxIter()) 

# Print "RegParam"
# print("  RegParam:", best_model.getRegParam())

**Best Model**
  Rank: 10


# Generate predictions and calculate RMSE

Now that we have a model that is trained on our data and tuned through cross validation, we can see how it performs on the test dataframe. To do this, we'll calculate the RMSE.

In [25]:
# View the predictions 
prediction.show(3)

# Calculate and print the RMSE of test_predictions
RMSE = evaluator.evaluate(prediction)
print(RMSE)

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|     1|      3|   4.0|  4.068782|
|     1|     50|   5.0|  4.923078|
|     1|    362|   5.0|  3.866249|
+------+-------+------+----------+
only showing top 3 rows

nan


# Interpreting the RMSE

This model was able to achieve an RMSE of 0.633. Click on the best interpretation of what this means.

- An RMSE of 0.633 means that on average the model predicts 0.633 above or below values of the original ratings matrix.

# Do recommendations make sense

Now that we have an understanding of how well our model performed, and have some confidence that it will provide recommendations that are relevant to users, let's actually look at recommendations made to a user and see if they make sense.

In [27]:
# Look at user 60's ratings
print("User 60's Ratings:")
ratings.filter(col("userId") == 60).sort("rating", ascending = False).show(3)

# Look at the movies recommended to user 60
print("User 60s Recommendations:")
prediction.filter(col("userId") == 60).show(3)

# Look at user 63's ratings
print("User 63's Ratings:")
ratings.filter(col("userId") == 63).sort("rating", ascending = False).show(3)

# Look at the movies recommended to user 63
print("User 63's Recommendations:")
prediction.filter(col("userId") == 63).show(3)

User 60's Ratings:
+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|    60|    527|   5.0|
|    60|    858|   5.0|
|    60|  58559|   5.0|
+------+-------+------+
only showing top 3 rows

User 60s Recommendations:
+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|    60|    318|   4.0|  4.246172|
|    60|    362|   4.0| 4.0332055|
|    60|   1562|   3.0| 2.4011354|
+------+-------+------+----------+

User 63's Ratings:
+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|    63|    296|   5.0|
|    63|      1|   5.0|
|    63|    318|   5.0|
+------+-------+------+
only showing top 3 rows

User 63's Recommendations:
+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|    63|     34|   3.0| 3.7479832|
|    63|    344|   5.0|  2.688593|
|    63|    356|   3.5| 3.9641566|
+------+-------+------+----------+
only showing top 3 rows



In [36]:
userRecs = best_model.recommendForAllUsers(5)
userRecs.show(3)

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|     1|[{8477, 6.528834}...|
|     2|[{1241, 5.9868026...|
|     3|[{6835, 5.1822457...|
+------+--------------------+
only showing top 3 rows



In [70]:
from pyspark.sql.functions import explode
# Explode recommendations column to separate itemId and prediction
recommendations_df = userRecs.withColumn("movieId_rating", explode("recommendations")) 
recommendations_df = recommendations_df.withColumn("movieId", col("movieId_rating").getField("movieId"))\
                .withColumn("rating", col("movieId_rating").getField("rating")) # Separate item
recommendations_df.show()
recommendations_df.dtypes


+------+--------------------+------------------+-------+---------+
|userId|     recommendations|    movieId_rating|movieId|   rating|
+------+--------------------+------------------+-------+---------+
|     1|[{8477, 6.528834}...|  {8477, 6.528834}|   8477| 6.528834|
|     1|[{8477, 6.528834}...|{92535, 5.9355555}|  92535|5.9355555|
|     1|[{8477, 6.528834}...|{86377, 5.8888526}|  86377|5.8888526|
|     1|[{8477, 6.528834}...|{27156, 5.8701963}|  27156|5.8701963|
|     1|[{8477, 6.528834}...|   {28, 5.8553843}|     28|5.8553843|
|     2|[{1241, 5.9868026...| {1241, 5.9868026}|   1241|5.9868026|
|     2|[{1241, 5.9868026...|{72171, 5.8943777}|  72171|5.8943777|
|     2|[{1241, 5.9868026...|{157775, 5.516651}| 157775| 5.516651|
|     2|[{1241, 5.9868026...|{156025, 5.516651}| 156025| 5.516651|
|     2|[{1241, 5.9868026...|{150554, 5.516651}| 150554| 5.516651|
|     3|[{6835, 5.1822457...| {6835, 5.1822457}|   6835|5.1822457|
|     3|[{6835, 5.1822457...| {5746, 5.1822457}|   5746|5.1822

[('userId', 'int'),
 ('recommendations', 'array<struct<movieId:int,rating:float>>'),
 ('movieId_rating', 'struct<movieId:int,rating:float>'),
 ('movieId', 'int'),
 ('rating', 'float')]