In [1]:
#a. Prepare Data

#Load the data from the ratings.csv and movies.csv files and combine them on movieId. The resultant data set should contain all of the user ratings and include movie titles. The schema should look something like this.

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql.functions import col

In [3]:
# Configure the application name first, then obtain the session from the
# builder (getOrCreate hands back the session used by every later cell).
session_builder = SparkSession.builder.appName('Week11')
spark = session_builder.getOrCreate()

In [4]:
# Both CSV files are read with the same reader settings: CSV format, first
# line treated as the header, and column types inferred from the data.
csv_options = {'format': 'csv', 'inferSchema': True, 'header': True}

movies = spark.read.load(r'/FileStore/tables/week11/movies.csv', **csv_options)
ratings = spark.read.load(r'/FileStore/tables/week11/ratings.csv', **csv_options)

In [5]:
# Attach the movie columns to every rating by matching on the shared movieId
# key; the inner join keeps only ratings whose movieId is present in `movies`.
df = movies.join(ratings, on='movieId', how='inner')

In [6]:
# Preview the joined data: first 20 rows, long cell values truncated
# (these are the defaults of DataFrame.show, spelled out explicitly).
df.show(n=20, truncate=True)

In [7]:
#b. Train Recommender

#Using the data you prepared in the last step, create a movie recommendation model using collaborative filtering. Spark’s collaborative filtering documentation provides a template for building and testing this model.

#Before you train the recommendation model, split the data into a training dataset and a testing dataset using the randomSplit dataframe method. Use 80% of your data for training and 20% for testing.

#After fitting your model using the training dataset, calculate the predictions on the test dataset and use the RegressionEvaluator to calculate the root-mean-square error of the model.

#As a reminder, Spark’s collaborative filtering documentation will be helpful in completing this task.

In [8]:
# 80/20 train/test split. A fixed seed is passed so that the same rows land in
# each partition on every "Restart & Run All"; without it the split (and hence
# the RMSE reported below) changes from run to run.
training, test = df.randomSplit([0.8, 0.2], seed=42)

In [9]:
# Collaborative-filtering model (alternating least squares) trained on the
# explicit user ratings.
#   - coldStartStrategy="drop" removes rows whose prediction is NaN (users or
#     movies not seen during training) so the RMSE below is a valid number.
#   - seed fixes the random initialisation of the factor matrices so that
#     re-running the notebook reproduces the same model.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=42,
)
model = als.fit(training)

In [10]:
# Score the held-out ratings with the fitted model.
predictions = model.transform(test)

# Compare the "prediction" column against the true "rating" column using
# root-mean-square error.
evaluator = RegressionEvaluator()
evaluator.setMetricName("rmse")
evaluator.setLabelCol("rating")
evaluator.setPredictionCol("prediction")

rmse = evaluator.evaluate(predictions)
print(rmse)

In [11]:
#c. Generate top 10 movie recommendations

#Using the recommendation model, generate the top ten recommendations for each user. Using the show method, print the recommendations for user IDs 127, 151, and 300. Do not truncate the results; call the show method like this: recommendations_127.show(truncate=False).

In [12]:
# Users whose recommendations we want to display.
user_ids = [127, 151, 300]

# Use the model's configured user column for BOTH the projection and the
# filter (previously the filter hard-coded 'userId' while the select used
# als.getUserCol(), so the two could drift apart). Filtering before distinct()
# also keeps the de-duplication to just the rows for the requested users.
user_col = als.getUserCol()
users = ratings.select(user_col).where(col(user_col).isin(user_ids)).distinct()

# Top 10 recommended movies for each of the selected users.
userRecs = model.recommendForUserSubset(users, 10)

In [13]:
# Display the recommendation rows in full; truncate=False keeps the whole
# recommendations array visible instead of cutting each cell short.
userRecs.show(n=20, truncate=False)

In [14]:
# Bring the (small) recommendation result to the driver once.
user_rec_rows = userRecs.collect()

# Resolve every recommended movieId to its title with a single Spark job,
# instead of launching one select/collect job per recommended movie.
# rec[0] is the first field of each recommendation entry (the movie id).
recommended_ids = {
    rec[0]
    for user_row in user_rec_rows
    for rec in user_row['recommendations']
}
title_by_id = {
    movie_row['movieId']: movie_row['title']
    for movie_row in (
        movies
        .where(col('movieId').isin(list(recommended_ids)))
        .select('movieId', 'title')
        .collect()
    )
}

for user_row in user_rec_rows:
    print(f'\nRecommendations for user {user_row[0]}\n')
    for rec in user_row['recommendations']:
        # Fall back to a placeholder rather than raising IndexError when a
        # recommended id has no entry in the movies table.
        print(title_by_id.get(rec[0], f'<unknown movieId {rec[0]}>'))
