In [1]:
from pyspark.sql import SparkSession

# Start (or reuse, if one is already running) a Spark session in local mode.
# 'local[*]' runs Spark on this machine instead of on a cluster.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("Load and Query CSV with SQL")
    .getOrCreate()
)

In [2]:
# Read the tab-separated triplets file. No header option is passed, so Spark
# names the columns _c0, _c1, _c2; column types are inferred from the data.
triplets_raw = spark.read.csv(
    "dataset/kaggle_visible_evaluation_triplets.txt",
    sep='\t',
    inferSchema=True,
)

# Replace the positional column names with meaningful ones.
msd = (
    triplets_raw
    .withColumnRenamed("_c0", "user")
    .withColumnRenamed("_c1", "song")
    .withColumnRenamed("_c2", "plays")
)

# Preview a few rows, then display the (column, type) pairs as the cell output.
msd.show(3)
msd.dtypes

+--------------------+------------------+-----+
|                user|              song|plays|
+--------------------+------------------+-----+
|fd50c4007b68a3737...|SOBONKR12A58A7A7E0|    1|
|fd50c4007b68a3737...|SOEGIYH12A6D4FC0E3|    1|
|fd50c4007b68a3737...|SOFLJQZ12A6D4FADA6|    1|
+--------------------+------------------+-----+
only showing top 3 rows



[('user', 'string'), ('song', 'string'), ('plays', 'int')]

In [3]:
from pyspark.sql.functions import monotonically_increasing_id
# Build numeric surrogate keys for the string `user` and `song` identifiers.
# Each distinct list is coalesced into a single partition before ids are
# assigned so that monotonically_increasing_id() increases consistently,
# and the result is persisted so the assigned ids do not change when the
# DataFrame is evaluated again.
users = (
    msd.select("user")
    .distinct()
    .coalesce(1)
    .withColumn("userId", monotonically_increasing_id())
    .persist()
)

songs = msd.select("song").distinct().coalesce(1)
items = songs.withColumn("songId", monotonically_increasing_id()).persist()

# Attach both numeric ids to every triplet; left joins keep all rows of `msd`.
msd = (
    msd
    .join(users, on="user", how="left")
    .join(items, on="song", how="left")
)


# Confirm understanding of implicit rating concepts

What is the difference between "implicit" ratings and "explicit" ratings?

- Explicit ratings are values that users have given to explicitly rate their preferences. Implicit ratings are "implied" from user behavior.

# MSD summary statistics

Let's get familiar with the Million Songs Echo Nest Taste Profile data subset. For purposes of this course, we'll just call it the Million Songs dataset or msd. Let's get the number of users and the number of songs. Let's also see which songs have the most plays from this subset.

In [4]:
# Continue with a reproducible (fixed-seed) sample of roughly 1% of the rows,
# with duplicate rows removed.
msd = (
    msd
    .sample(fraction=0.01, seed=42)
    .distinct()
)

In [5]:
# Preview the first rows of the sampled data
msd.show(3)

# How many different users appear in the sample?
distinct_users = msd.select("userId").distinct()
user_count = distinct_users.count()
print("Number of users: ", user_count)

# How many different songs appear in the sample?
distinct_songs = msd.select("songId").distinct()
song_count = distinct_songs.count()
print("Number of songs: ", song_count)

+------------------+--------------------+-----+------+------+
|              song|                user|plays|userId|songId|
+------------------+--------------------+-----+------+------+
|SOSASFL12A6D4F7B02|b815763c18263b545...|    3| 23896|     0|
|SOUZDNN12A6701C563|0e041cb084b8da194...|    5| 24932|   264|
|SOPZXJL12A6310D817|8d051ab32db6e1da1...|    1| 45709|   252|
+------------------+--------------------+-----+------+------+
only showing top 3 rows

Number of users:  13443
Number of songs:  10251


# Grouped summary statistics

In this exercise, we are going to combine the .groupBy() and .filter() methods that you've used previously to calculate the min() and avg() number of users that have rated each song, and the min() and avg() number of songs that each user has rated.

In [6]:
# The play count is used as the implicit rating from here on, so give the
# column the name the following cells expect.
msd = msd.withColumnRenamed(existing="plays", new="num_plays")


In [7]:
from pyspark.sql.functions import col,avg, min, max
# Only rows with at least one play count as an implicit rating.
# NOTE: `min` and `avg` below are the pyspark.sql.functions aggregates
# imported at the top of this cell, not the Python builtins.
played = msd.filter(col("num_plays") > 0)

# Number of implicit ratings per song and per user (one row per id, in a
# column named "count").
ratings_per_song = played.groupBy("songId").count()
ratings_per_user = played.groupBy("userId").count()

# Fewest implicit ratings received by any song
print("Minimum implicit ratings for a song: ")
ratings_per_song.select(min("count")).show()

# Mean number of implicit ratings received per song
print("Average implicit ratings per song: ")
ratings_per_song.select(avg("count")).show()

# Fewest implicit ratings given by any user
print("Minimum implicit ratings from a user: ")
ratings_per_user.select(min("count")).show()

# Mean number of implicit ratings given per user
print("Average implicit ratings per user: ")
ratings_per_user.select(avg("count")).show()

Minimum implicit ratings for a song: 
+----------+
|min(count)|
+----------+
|         1|
+----------+

Average implicit ratings per song: 
+------------------+
|        avg(count)|
+------------------+
|1.4247390498487953|
+------------------+

Minimum implicit ratings from a user: 
+----------+
|min(count)|
+----------+
|         1|
+----------+

Average implicit ratings per user: 
+------------------+
|        avg(count)|
+------------------+
|1.0864390389050063|
+------------------+



# Add zeros

Many recommendation engines use implicit ratings. In many cases these datasets don't include behavior counts for items that a user has never purchased. In these cases, you'll need to add them and include zeros.

In [9]:
# Z = msd.withColumnRenamed("songId", "productId")
# # View the data
# Z.show()

# # Extract distinct userIds and productIds
# users = Z.select("userId").distinct()
# products = Z.select("productId").distinct()

# # Cross join users and products
# cj = users.crossJoin(products)

# # Join cj and Z
# Z_expanded = cj.join(Z, ["userId", "productId"], "left").fillna(0)

# # View Z_expanded
# Z_expanded.show()

# Specify ALS hyperparameters

You're now going to build your first implicit rating recommendation engine using ALS. To do this, you will first tell Spark what values you want it to try when finding the best model.

In [10]:
# Candidate values for each ALS hyperparameter. The model-building cell
# creates one model for every combination of these values.
ranks = list(range(10, 41, 10))       # 10, 20, 30, 40
maxIters = list(range(10, 41, 10))    # 10, 20, 30, 40
regParams = [0.05, 0.1, 0.15]
alphas = list(range(20, 81, 20))      # 20, 40, 60, 80

# Build implicit models

Now that you have all of your hyperparameter values specified, let's have Spark build enough models to test each combination. 

In [11]:
from pyspark.ml.recommendation import ALS
# Build one untrained ALS estimator per hyperparameter combination.
# The order matches four nested loops: rank varies slowest, alpha fastest.
model_list = [
    ALS(
        userCol="userId",
        itemCol="songId",
        ratingCol="num_plays",
        rank=r,
        maxIter=mi,
        regParam=rp,
        alpha=a,
        implicitPrefs=True,
        nonnegative=True,
        coldStartStrategy="drop",
    )
    for r in ranks
    for mi in maxIters
    for rp in regParams
    for a in alphas
]

# Show the estimators and how many were created
print(model_list, "Length of model_list: ", len(model_list))

# Sanity check (cell output): exactly one model per point of the grid
expected_model_count = len(ranks) * len(maxIters) * len(regParams) * len(alphas)
len(model_list) == expected_model_count

[ALS_874a3d2d3102, ALS_1cb757d7b50a, ALS_d5d5d593863b, ALS_3c1adaa8758d, ALS_6c0d22fb2e96, ALS_721976a90760, ALS_928d66ca0bdd, ALS_f25c2e118c90, ALS_685ebf3350a2, ALS_6ae6f4889021, ALS_ae70b9ef5776, ALS_b1bcc0715909, ALS_aee901af5d6f, ALS_9c935460b57a, ALS_e9b8c6305e94, ALS_f58ce5e70d91, ALS_4b6d5585eff3, ALS_a2e7eaad3c54, ALS_835170778132, ALS_a8398acb3218, ALS_abf0cbae7998, ALS_35a226d32a82, ALS_1a3d3e2f9307, ALS_7253f4d795e2, ALS_9cdead0ad7e5, ALS_7ae4b71e737e, ALS_c6688d1c2f3b, ALS_0c45a6b0d63e, ALS_f6d4ab44317f, ALS_a03d3d86bec4, ALS_946fc3fc8469, ALS_5f42acc8462e, ALS_84a218f12c2c, ALS_0cefd72f2458, ALS_284328efb959, ALS_db068cfef072, ALS_9a5eca83ec70, ALS_fe2d7d10f377, ALS_05d31a0570d1, ALS_8991874169f9, ALS_a1f5274093cc, ALS_0d712edd6c83, ALS_5caf48c1cd03, ALS_952446eef5cb, ALS_8192407f54be, ALS_40b7f40a0bae, ALS_11355873a821, ALS_4840ecade58d, ALS_ccd5c444214b, ALS_fee5146ff8c1, ALS_6db28d3e0ff8, ALS_df3f80def4db, ALS_8859a7b3e40e, ALS_1ab331cf2a9d, ALS_9f9ddcabf9f2, ALS_5a2a6

True

# Running a cross-validated implicit ALS model

Now that we have several ALS models, each with a different set of hyperparameter values, we can train them on the training portion of the msd dataset using cross-validation, then run them on a held-out test set and evaluate how well each one performs using the ROEM function.

In [12]:
# # ROEM is a metric used to evaluate ALS recommendation models trained on implicit ratings.
# # ROEM stands for Rank Ordering Error Metric.
# # Unfortunately, PySpark does not provide native support for ROEM.
# # Here is a custom implementation of ROEM:

# def ROEM(predictions, userCol="userId", itemCol="songId", ratingCol="num_plays"):
#     # Create table that can be queried
#     predictions.createOrReplaceTempView("predictions")
#     # Sum of total number of plays of all songs
#     denominator = predictions.groupBy().sum(ratingCol).collect()[0][0]
#     # Calculating rankings of songs predictions by user
#     spark.sql(
#         "SELECT " + userCol + " , " + ratingCol + " , PERCENT_RANK() OVER (PARTITION BY " + userCol + " ORDER BY prediction DESC) AS rank FROM predictions"
#     ).createOrReplaceTempView("rankings")
#     # Multiplies the rank of each song by the number of plays and adds the products together
#     numerator = spark.sql('SELECT SUM(' + ratingCol + ' * rank) FROM rankings').collect()[0][0]
#     # Compute ROEM
#     roem = numerator / denominator
#     return roem
    
# # Split the data into training and test sets
# (training, test) = msd.randomSplit([0.8, 0.2])
# #Building 5 folds within the training set.
# train1, train2, train3, train4, train5 = training.randomSplit([0.2, 0.2, 0.2, 0.2, 0.2], seed = 1)
# fold1 = train2.union(train3).union(train4).union(train5)
# fold2 = train3.union(train4).union(train5).union(train1)
# fold3 = train4.union(train5).union(train1).union(train2)
# fold4 = train5.union(train1).union(train2).union(train3)
# fold5 = train1.union(train2).union(train3).union(train4)

# foldlist = [(fold1, train1), (fold2, train2), (fold3, train3), (fold4, train4), (fold5, train5)]

# # Empty list to fill with ROEMs from each model
# ROEMS = []

# # Loops through all models and all folds
# for model in model_list:
#     for ft_pair in foldlist:
#         # Fits model to fold within training data
#         fitted_model = model.fit(ft_pair[0])
#         # Generates predictions using fitted_model on respective CV test data
#         predictions = fitted_model.transform(ft_pair[1])
#         # Generates and prints a ROEM metric CV test data
#         r = ROEM(predictions)
#         print ("ROEM: ", r)
#     # Fits model to all of training data and generates preds for test data
#     v_fitted_model = model.fit(training)
#     v_predictions = v_fitted_model.transform(test)
#     v_ROEM = ROEM(v_predictions)
#     # Adds validation ROEM to ROEM list
#     ROEMS.append(v_ROEM)
#     print ("Validation ROEM: ", v_ROEM)

In [13]:
import numpy

# Hard-coded ROEM scores (presumably copied from an earlier run of the
# cross-validation cell, which is currently commented out; confirm).
ROEMS = [
    0.22772277227722793,
    0.3762376237623761,
    0.29702970297029685,
    0.4455445544554455,
    0.13861386138613874,
    0.12871287128712883,
    0.3663366336633662,
]

# Position of the smallest ROEM in the list
i = numpy.argmin(ROEMS)
print("Index of smallest ROEM:", i)

# The ROEM value found at that position
print("Smallest ROEM: ", ROEMS[i])

Index of smallest ROEM: 5
Smallest ROEM:  0.12871287128712883


# Extracting parameters

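You've now tested 192 different models on the msd dataset, and you found the best ROEM and its respective model (model 38). Note that the hard-coded `ROEMS` list in the previous cell holds only 7 values, so the index printed there (5) is not the model number quoted here.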

You now need to extract the hyperparameters. The model_list you created previously is provided here. It contains all 192 models you generated.

In [14]:
# # Extract the best_model
# best_model = model_list[38]

# # Extract the Rank
# print ("Rank: ", best_model.getRank())

# # Extract the MaxIter value
# print ("MaxIter: ", best_model.getMaxIter())

# # Extract the RegParam value
# print ("RegParam: ", best_model.getRegParam())

# # Extract the Alpha value
# print ("Alpha: ", best_model.getAlpha())

# Binary model performance

Let's look at the binary_test_predictions from this model to see what we can learn.

In [15]:
# # Import the col function
# from pyspark.sql.functions import col

# # Look at the test predictions
# binary_test_predictions.show()

# # Evaluate ROEM on test predictions
# ROEM(binary_test_predictions)

# # Look at user 42's test predictions
# binary_test_predictions.filter(col("userId") == 42).show()

# Recommendations from binary data

As the ROEM shows, these models can still generate meaningful test predictions. Let's look at the actual recommendations now.

In [16]:
# # View user 26's original ratings
# print ("User 26 Original Ratings:")
# original_ratings.filter(col("userId") == 26).show()

# # View user 26's recommendations
# print ("User 26 Recommendations:")
# binary_recs.filter(col("userId") == 26).show()

# # View user 99's original ratings
# print ("User 99 Original Ratings:")
# original_ratings.filter(col("userId") == 99).show()

# # View user 99's recommendations
# print ("User 99 Recommendations:")
# binary_recs.filter(col("userId") == 99).show()