**1. Setup**

In [1]:
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql import functions
from pyspark import SparkConf
from pyspark.context import SparkContext

spark = SparkSession.builder \
    .appName("Project") \
    .master("spark://10.10.28.60:7077") \
    .config("spark.driver.memory", "2g") \
    .config("spark.executor.cores", "2") \
    .config("spark.executor.memory", "2g") \
    .config("spark.local.dir", "/tmp/spark-temp") \
    .getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)
    # .config("spark.driver.memory", "2g") \
    # .config("spark.executor.memory", "2g") \
    
print(spark.version)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/27 09:19:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/05/27 09:19:11 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


3.5.1




In [2]:
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import *

from pyspark.sql.types import *

import pandas as pd
import numpy as np

**2. Data Preparation**




In [3]:
movies = spark.read.parquet('hdfs://master5:9000/user/dis/movielens/movies.parquet')
ratings = spark.read.parquet('hdfs://master5:9000/user/dis/movielens/ratings.parquet')
tags = spark.read.parquet('hdfs://master5:9000/user/dis/movielens/tags.parquet')

                                                                                

In [4]:
df_movies = movies
df_ratings = ratings
df_tags = tags

In [5]:
df_movies.createOrReplaceTempView("movies")
df_ratings.createOrReplaceTempView("ratings")
df_tags.createOrReplaceTempView("tags")

In [6]:
df_ratings = df_ratings.drop('timestamp')
df_ratings = df_ratings.dropna(subset=['userId', 'movieId'])
df_ratings = df_ratings.withColumn("userId", df_ratings["userId"].cast("int"))
df_ratings = df_ratings.withColumn("movieId", df_ratings["movieId"].cast("int"))

In [7]:
model_path = 'hdfs://master5:9000/user/dis/output-4'

In [7]:
(train, test) = df_ratings.randomSplit([0.8, 0.2], seed=123)

In [8]:
alsb = ALS(rank=10, maxIter=10, regParam=0.03, userCol="userId", itemCol="movieId", ratingCol="rating", \
               coldStartStrategy="drop")
alsb_model = alsb.fit(train)

alsb_predictions = alsb_model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(alsb_predictions)
print("Root-mean-square error = " + str(rmse))

# save the ALS model
#alsb_model.save(model_path + 'als')

# 20,5,0.1 Root-mean-square error = 0.8226567600924868
# 10,10,0.3 Root-mean-square error = 0.9065847323471237
# 10,10,0.2 Root-mean-square error = 0.8576249876087348

24/05/26 15:56:10 ERROR TaskSchedulerImpl: Lost executor 1 on 10.10.28.60: Command exited with code 137
24/05/26 15:56:10 WARN TaskSetManager: Lost task 2.0 in stage 5.0 (TID 13) (10.10.28.60 executor 1): ExecutorLostFailure (executor 1 exited caused by one of the running tasks) Reason: Command exited with code 137
24/05/26 15:56:10 WARN TaskSetManager: Lost task 5.0 in stage 5.0 (TID 16) (10.10.28.60 executor 1): ExecutorLostFailure (executor 1 exited caused by one of the running tasks) Reason: Command exited with code 137
24/05/26 15:56:10 WARN TaskSetManager: Lost task 2.1 in stage 5.0 (TID 18) (10.10.28.61 executor 2): FetchFailed(BlockManagerId(1, 10.10.28.60, 36701, None), shuffleId=1, mapIndex=0, mapId=4, reduceId=2, message=
org.apache.spark.shuffle.FetchFailedException
	at org.apache.spark.errors.SparkCoreErrors$.fetchFailedError(SparkCoreErrors.scala:437)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1233)

Root-mean-square error = 0.8063740141561199


                                                                                

In [14]:
alsb_model = ALSModel.load(model_path + 'als')

In [15]:
alsn_model = ALSModel.read().load(model_path+ 'als')

#Get 5 recommends for user
userRecoms = alsn_model.recommendForAllUsers(5)


In [17]:
userRecoms.show()



+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|     1|[{175275, 6.86276...|
|     3|[{200930, 7.04948...|
|     5|[{193063, 6.79898...|
|     6|[{225435, 7.52915...|
|     9|[{225435, 5.98800...|
|    12|[{225435, 8.80683...|
|    13|[{178501, 7.54789...|
|    15|[{193063, 6.26425...|
|    16|[{216663, 7.45282...|
|    17|[{126941, 8.21922...|
|    19|[{178727, 5.24674...|
|    20|[{199187, 8.11858...|
|    22|[{160824, 6.26756...|
|    26|[{173651, 6.97203...|
|    27|[{173651, 6.18857...|
|    28|[{116847, 5.61974...|
|    31|[{177209, 5.32568...|
|    34|[{185211, 6.33270...|
|    35|[{200930, 5.75171...|
|    37|[{222368, 6.95421...|
+------+--------------------+
only showing top 20 rows



                                                                                

In [19]:
userRecoms.write.mode('overwrite').parquet(model_path + "recom_als")

                                                                                

In [8]:
recommendation = spark.read.parquet(model_path + "recom_als")
recommendation.show(23)

                                                                                

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|     1|[{175275, 6.86276...|
|     3|[{200930, 7.04948...|
|     5|[{193063, 6.79898...|
|     6|[{225435, 7.52915...|
|     9|[{225435, 5.98800...|
|    12|[{225435, 8.80683...|
|    13|[{178501, 7.54789...|
|    15|[{193063, 6.26425...|
|    16|[{216663, 7.45282...|
|    17|[{126941, 8.21922...|
|    19|[{178727, 5.24674...|
|    20|[{199187, 8.11858...|
|    22|[{160824, 6.26756...|
|    26|[{173651, 6.97203...|
|    27|[{173651, 6.18857...|
|    28|[{116847, 5.61974...|
|    31|[{177209, 5.32568...|
|    34|[{185211, 6.33270...|
|    35|[{200930, 5.75171...|
|    37|[{222368, 6.95421...|
|    40|[{173655, 7.05399...|
|    41|[{225435, 6.02123...|
|    43|[{222368, 7.00660...|
+------+--------------------+
only showing top 23 rows



In [9]:
def get_recommendations(user_id):
    recs = recommendation.filter(col("userId") == user_id).select("recommendations")
    recs = recs.select(explode(col("recommendations")).alias("rec")).select("rec.movieId", "rec.rating")
    item_list = recs.orderBy(col("rating").desc()).select("movieId").rdd.flatMap(lambda x: x).collect()
    return item_list


In [10]:
for i in range(1, 101):
    result = get_recommendations(i)
    print(f'Recommend movies for user {i}: ')
    print(result)

                                                                                

Recommend movies for user 1: 
[175275, 126941, 225435, 225429, 222368]


                                                                                

Recommend movies for user 2: 
[229615, 176657, 183053, 181873, 179063]
Recommend movies for user 3: 
[200930, 178501, 125938, 231289, 231287]
Recommend movies for user 4: 
[216663, 173655, 206160, 104119, 80195]
Recommend movies for user 5: 
[193063, 160824, 236067, 225435, 225429]


                                                                                

Recommend movies for user 6: 
[225435, 225429, 225437, 225425, 126941]
Recommend movies for user 7: 
[126941, 175275, 225435, 225429, 222368]
Recommend movies for user 8: 
[193063, 122015, 236067, 7699, 160824]
Recommend movies for user 9: 
[225435, 225429, 222368, 126941, 175275]
Recommend movies for user 10: 
[126941, 193063, 225435, 225429, 160824]
Recommend movies for user 11: 
[240054, 160824, 222368, 193817, 231289]
Recommend movies for user 12: 
[225435, 225429, 225437, 225425, 147734]
Recommend movies for user 13: 
[178501, 160824, 200930, 222368, 98221]
Recommend movies for user 14: 
[160824, 214148, 148886, 98221, 248830]
Recommend movies for user 15: 
[193063, 155020, 228769, 215635, 160824]
Recommend movies for user 16: 
[216663, 175275, 169604, 125938, 175625]
Recommend movies for user 17: 
[126941, 178501, 175275, 181333, 185373]
Recommend movies for user 18: 
[236067, 223978, 225435, 225429, 173651]
Recommend movies for user 19: 
[178727, 182521, 222007, 243374, 191943]


In [18]:
userRecoms.count()

                                                                                

329033