**1. Setup**

In [None]:
# Start Spark
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql import functions
from pyspark import SparkConf
from pyspark.context import SparkContext

# Create (or reuse) the SparkSession connected to the standalone master.
# The chain is wrapped in parentheses so no backslash continuations are needed.
spark = (
    SparkSession.builder
    .appName("Project")
    .master("spark://10.10.28.60:7077")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.cores", "2")
    .config("spark.executor.memory", "2g")
    .config("spark.local.dir", "/tmp/spark-temp")
    .getOrCreate()
)

# Handles derived from the session, kept under their original names.
sc = spark.sparkContext
sqlContext = SQLContext(sc)

# Show which Spark version the session reports.
print(spark.version)

In [None]:
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import *

from pyspark.sql.types import *

import pandas as pd
import numpy as np

**2. Data Preparation**




In [None]:
# Load the three MovieLens tables (movies, ratings, tags) from HDFS as
# Spark DataFrames.
movies, ratings, tags = (
    spark.read.parquet('hdfs://master5:9000/user/dis/movielens/movies.parquet'),
    spark.read.parquet('hdfs://master5:9000/user/dis/movielens/ratings.parquet'),
    spark.read.parquet('hdfs://master5:9000/user/dis/movielens/tags.parquet'),
)

In [None]:
# Alias each loaded DataFrame under a df_* name and register it as a temporary
# view so the data can also be accessed with SQL-like queries.
# Each view is bound to the DataFrame as it exists at this point.
df_movies = movies
df_movies.createOrReplaceTempView("movies")

df_ratings = ratings
df_ratings.createOrReplaceTempView("ratings")

df_tags = tags
df_tags.createOrReplaceTempView("tags")

In [None]:
# Prepare the ratings for ALS in a single chain:
#   1. drop the 'timestamp' column,
#   2. discard rows with a missing userId or movieId,
#   3. cast both id columns to int.
df_ratings = (
    df_ratings
    .drop('timestamp')
    .dropna(subset=['userId', 'movieId'])
    .withColumn("userId", col("userId").cast("int"))
    .withColumn("movieId", col("movieId").cast("int"))
)

In [None]:
# Base HDFS location for this notebook's outputs. The cells that use it append
# their suffix directly (model_path + 'als', model_path + "recom_als") with no
# "/" in between, so the resulting paths end in "output-4als" and
# "output-4recom_als" rather than being children of an "output-4" directory.
model_path = 'hdfs://master5:9000/user/dis/output-4'

In [None]:
# Randomly split the prepared ratings into a training part (weight 0.8) and a
# test part (weight 0.2), passing a fixed seed to the split.
train, test = df_ratings.randomSplit(weights=[0.8, 0.2], seed=123)

In [None]:
# Configure the ALS (Alternating Least Squares) recommender.
# - coldStartStrategy="drop" removes rows whose prediction is NaN (users or
#   movies not seen during training) so the RMSE below is not NaN.
# - seed is pinned (same value as the train/test split) so repeated runs start
#   from the same initial factors.
alsb = ALS(
    rank=15,
    maxIter=15,
    regParam=0.05,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=123,
)

# Train the ALS model on the training data.
alsb_model = alsb.fit(train)

# Score the held-out test data and report the RMSE between the actual
# "rating" and the model's "prediction".
alsb_predictions = alsb_model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(alsb_predictions)
print("Root-mean-square error = " + str(rmse))

# Persist the model. write().overwrite() replaces a copy left by an earlier
# run; a plain save() raises when the target path already exists, which would
# make this cell fail on every re-run of the notebook.
alsb_model.write().overwrite().save(model_path + 'als')

In [None]:
# Reload the ALS model from the location it was saved to.
alsn_model = ALSModel.load(model_path + 'als')

# Build the top-5 movie recommendations for every user known to the model.
userRecoms = alsn_model.recommendForAllUsers(numItems=5)


In [None]:
# Store the per-user recommendations as Parquet, replacing any existing output
# at that location.
userRecoms.write.parquet(model_path + "recom_als", mode='overwrite')

In [None]:
# Read the stored recommendations back from Parquet into a DataFrame.
recommendation = spark.read.format("parquet").load(model_path + "recom_als")

In [None]:
def get_recommendations(user_id):
    """
    Return the recommended movie IDs stored for one user, best rating first.

    Reads the notebook-level ``recommendation`` DataFrame, keeps the rows whose
    ``userId`` equals ``user_id``, unpacks the ``recommendations`` array into
    one row per entry, and orders the entries by ``rating`` in descending order.

    Args:
        user_id (int): The ID of the user to look up.

    Returns:
        list: The ``movieId`` values for that user, ordered by descending
        rating. Empty when no row matches ``user_id``.
    """
    user_rows = recommendation.where(col("userId") == user_id)
    # One output row per element of the user's recommendations array.
    entries = user_rows.select(explode(col("recommendations")).alias("rec"))
    ranked = entries.select("rec.movieId", "rec.rating").orderBy(col("rating").desc())
    # Bring the single remaining column to the driver as a plain Python list.
    return [row["movieId"] for row in ranked.select("movieId").collect()]


In [None]:
# Recommendation for 100 users (user IDs 1..100).
# Fetch every requested user in ONE Spark job instead of running a separate
# filter/explode/sort job per user, then order each user's entries locally.
first_user, last_user = 1, 100
fetched_rows = (
    recommendation
    .filter(col("userId").between(first_user, last_user))  # bounds are inclusive
    .select("userId", "recommendations")
    .collect()
)

# userId -> movie IDs ordered by descending rating.
recs_by_user = {
    row["userId"]: [
        rec["movieId"]
        for rec in sorted(row["recommendations"], key=lambda rec: rec["rating"], reverse=True)
    ]
    for row in fetched_rows
}

for user_id in range(first_user, last_user + 1):
    print(f'Recommend movies for user {user_id}: ')
    # Users with no stored recommendations print an empty list.
    print(recs_by_user.get(user_id, []))