In [1]:
from pyspark.sql import SparkSession
import getpass

username = getpass.getuser()

spark = SparkSession. \
builder. \
config('spark.ui.port','0'). \
config("spark.sql.warehouse.dir", f"/user/{username}/warehouse"). \
enableHiveSupport(). \
master('yarn'). \
getOrCreate()

In [2]:
spark

---

### Import Libraries

In [3]:
from time import time
import csv
import os

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
from pyspark.ml.recommendation import ALS,ALSModel
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

### Schema Writing & Data Loading

In [4]:
schema_ratings = StructType([
    StructField("item_id", IntegerType(), False),
    StructField("user_id", IntegerType(), False),
    StructField("rating", FloatType(), False)])

schema_items = StructType([
    StructField("item_id", IntegerType(), False),
    StructField("title", StringType(), False)])

# schema_reviews = StructType([
#     StructField("item_id", IntegerType(), False),
#     StructField("txt", StringType(), False)])


ratings_df = spark.read.json("/user/itv009301/movieDataset/ratings.json", schema=schema_ratings)
metadata_df = spark.read.json("/user/itv009301/movieDataset/metadata.json", schema=schema_items)
# reviews_df = spark.read.json("/user/itv009301/movieDataset/reviews.json", schema=schema_reviews)
# metadata_updated_df = spark.read.option('inferschema', 'true').json("/user/itv009301/movieDataset/metadata_updated.json")

### Filter ratings dataframe
>Filter records which are present in both ratings.json and metadata.json

In [5]:
filtered_ratings_df = metadata_df.alias("m").join(
    ratings_df.alias("r"),
    metadata_df["item_id"] == ratings_df["item_id"],
    "inner").select("r.*")
# print("Number of records before filtering: ",ratings_df.count())
# print("\nNumber of records after filtering: ",filtered_ratings_df.count())

### Take out test set

>When training recommendation algorithms, the traditional random split of data into training and testing sets may not be suitable. In the context of recommendation systems, we need to handle the unique challenge of ensuring that all users are present in both the training and testing datasets.
To address this issue, a common approach is to perform a user-level dataset split. Instead of randomly dividing the data at the individual data point level, we mask a certain percentage of items for each user and use these masked items as the testing set. Specifically, we randomly hide a portion (e.g., 20%) of the items that each user has interacted with in the training set.


In [6]:
# filtered_ratings_df = filtered_ratings_df.withColumn("num_items", expr("count(*) over (partition by user_id)"))

# # 20% of items will be masked
# percent_items_to_mask = 0.2

# # Determine the number of items to mask for each user
# filtered_ratings_df = filtered_ratings_df.withColumn("num_items_to_mask", \
#                                                      (col("num_items") * percent_items_to_mask).cast("int"))
# # Masks items for each user
# filtered_ratings_df = filtered_ratings_df.withColumn("item_rank", \
#                                           rank().over(Window.partitionBy("user_id").orderBy(col("item_id").desc())))

training_df, testing_df = filtered_ratings_df.randomSplit([0.8, 0.2])

------

### Model Training

#### Model Parameters


In [7]:
rank = 200
numIterations = 30
regParameter = 0.05
nonNegativity = False

In [None]:
print('Model training in process.....')
t0 = time()

# Initialize the ALS model
als = ALS(maxIter=numIterations \
          , rank=rank \
          , regParam=regParameter\
          , userCol="user_id" \
          , itemCol="item_id"
          , ratingCol="rating"
          , coldStartStrategy="drop"
          , nonnegative=nonNegativity)

# Fit the model
model = als.fit(training_df)


print("Model trained in seconds", time()-t0)

Model training in process.....


### Save the model

In [None]:
modelPath = f"/user/itv009301/model/rank_{rank}_lambda_{regParameter}_model"
model.save(modelPath)

### Model Evaluation

>User-level split allows us to ensure that the testing set contains users, which were part of training process, which is critical for evaluating the performance of the recommendation system accurately. By using metrics such as precision and NDCG on the masked items, we can compare the predicted recommendations with the actual hidden items to assess the quality and effectiveness of the recommendation model. This method provides a more realistic evaluation of the recommendation system’s performance in a scenario where some user-item interactions are unknown during training but need to be predicted during testing.

In [13]:
# Make recommendation on test dataset (if any users are unseen during training then they will be dropped becasue we have set coldStartStrategy='drop')
t0 = time()

predictions = model.transform(testing_df)

evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)

print(f"RMSE without Grid Search: {rank}, {numIterations}, {regParameter},{nonNegativity}, {rmse}")

print("\nError Calculated in seconds:", time()-t0)

predictions.show()

Py4JJavaError: An error occurred while calling o288.evaluate.
: org.apache.spark.SparkException: Job aborted due to stage failure: ShuffleMapStage 31 (rdd at RegressionEvaluator.scala:125) has failed the maximum allowable number of times: 4. Most recent failure reason: org.apache.spark.shuffle.FetchFailedException: java.lang.IllegalArgumentException: Unknown message type: 9 	at org.apache.spark.network.shuffle.protocol.BlockTransferMessage$Decoder.fromByteBuffer(BlockTransferMessage.java:71) 	at org.apache.spark.network.shuffle.ExternalShuffleBlockHandler.receive(ExternalShuffleBlockHandler.java:80) 	at org.apache.spark.network.server.TransportRequestHandler.processRpcRequest(TransportRequestHandler.java:180) 	at org.apache.spark.network.server.TransportRequestHandler.handle(TransportRequestHandler.java:103) 	at org.apache.spark.network.server.TransportChannelHandler.channelRead(TransportChannelHandler.java:118) 	at org.spark_project.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379) 	at org.spark_project.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365) 	at org.spark_project.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357) 	at org.spark_project.io.netty.handler.timeout.IdleStateHandler.channelRead(IdleStateHandler.java:286) 	at org.spark_project.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379) 	at org.spark_project.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365) 	at org.spark_project.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357) 	at org.spark_project.io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:102) 	at org.spark_project.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379) 	at org.spark_project.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365) 	at org.spark_project.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357) 	at org.apache.spark.network.util.TransportFrameDecoder.channelRead(TransportFrameDecoder.java:85) 	at org.spark_project.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379) 	at org.spark_project.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365) 	at org.spark_project.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357) 	at org.spark_project.io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1410) 	at org.spark_project.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379) 	at org.spark_project.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365) 	at org.spark_project.io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:919) 	at org.spark_project.io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:163) 	at org.spark_project.io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:714) 	at org.spark_project.io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:650) 	at org.spark_project.io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:576) 	at org.spark_project.io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:493) 	at org.spark_project.io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:989) 	at org.spark_project.io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) 	at org.spark_project.io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) 	at java.lang.Thread.run(Thread.java:748)  	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:770) 	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:685) 	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:70) 	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29) 	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484) 	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:490) 	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458) 	at org.apache.spark.util.CompletionIterator.hasNext(CompletionIterator.scala:31) 	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37) 	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458) 	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.sort_addToSorter_0$(Unknown Source) 	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.processNext(Unknown Source) 	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) 	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:755) 	at org.apache.spark.sql.execution.RowIteratorFromScala.advanceNext(RowIterator.scala:83) 	at org.apache.spark.sql.execution.joins.SortMergeJoinScanner.advancedStreamed(SortMergeJoinExec.scala:800) 	at org.apache.spark.sql.execution.joins.SortMergeJoinScanner.findNextOuterJoinRows(SortMergeJoinExec.scala:759) 	at org.apache.spark.sql.execution.joins.OneSideOuterIterator.advanceStream(SortMergeJoinExec.scala:925) 	at org.apache.spark.sql.execution.joins.OneSideOuterIterator.advanceNext(SortMergeJoinExec.scala:961) 	at org.apache.spark.sql.execution.RowIteratorToScala.hasNext(RowIterator.scala:68) 	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage6.processNext(Unknown Source) 	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) 	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:755) 	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458) 	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:132) 	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59) 	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99) 	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52) 	at org.apache.spark.scheduler.Task.run(Task.scala:131) 	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497) 	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439) 	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500) 	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) 	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) 	at java.lang.Thread.run(Thread.java:748) Caused by: java.lang.RuntimeException: java.lang.IllegalArgumentException: Unknown message type: 9 	at org.apache.spark.network.shuffle.protocol.BlockTransferMessage$Decoder.fromByteBuffer(BlockTransferMessage.java:71) 	at org.apache.spark.network.shuffle.ExternalShuffleBlockHandler.receive(ExternalShuffleBlockHandler.java:80) 	at org.apache.spark.network.server.TransportRequestHandler.processRpcRequest(TransportRequestHandler.java:180) 	at org.apache.spark.network.server.TransportRequestHandler.handle(TransportRequestHandler.java:103) 	at org.apache.spark.network.server.TransportChannelHandler.channelRead(TransportChannelHandler.java:118) 	at org.spark_project.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379) 	at org.spark_project.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365) 	at org.spark_project.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357) 	at org.spark_project.io.netty.handler.timeout.IdleStateHandler.channelRead(IdleStateHandler.java:286) 	at org.spark_project.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379) 	at org.spark_project.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365) 	at org.spark_project.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357) 	at org.spark_project.io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:102) 	at org.spark_project.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379) 	at org.spark_project.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365) 	at org.spark_project.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357) 	at org.apache.spark.network.util.TransportFrameDecoder.channelRead(TransportFrameDecoder.java:85) 	at org.spark_project.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379) 	at org.spark_project.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365) 	at org.spark_project.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357) 	at org.spark_project.io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1410) 	at org.spark_project.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379) 	at org.spark_project.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365) 	at org.spark_project.io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:919) 	at org.spark_project.io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:163) 	at org.spark_project.io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:714) 	at org.spark_project.io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:650) 	at org.spark_project.io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:576) 	at org.spark_project.io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:493) 	at org.spark_project.io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:989) 	at org.spark_project.io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) 	at org.spark_project.io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) 	at java.lang.Thread.run(Thread.java:748)  	at org.apache.spark.network.client.TransportResponseHandler.handle(TransportResponseHandler.java:208) 	at org.apache.spark.network.server.TransportChannelHandler.channelRead0(TransportChannelHandler.java:142) 	at org.apache.spark.network.server.TransportChannelHandler.channelRead0(TransportChannelHandler.java:53) 	at io.netty.channel.SimpleChannelInboundHandler.channelRead(SimpleChannelInboundHandler.java:99) 	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379) 	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365) 	at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357) 	at io.netty.handler.timeout.IdleStateHandler.channelRead(IdleStateHandler.java:286) 	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379) 	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365) 	at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357) 	at io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:103) 	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379) 	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365) 	at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357) 	at org.apache.spark.network.util.TransportFrameDecoder.channelRead(TransportFrameDecoder.java:102) 	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379) 	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365) 	at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357) 	at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1410) 	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379) 	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365) 	at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:919) 	at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:163) 	at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:714) 	at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:650) 	at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:576) 	at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:493) 	at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:989) 	at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) 	at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) 	... 1 more 
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2258)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2207)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2206)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2206)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskCompletion(DAGScheduler.scala:1768)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2442)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2387)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2376)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2196)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2291)
	at org.apache.spark.rdd.RDD.$anonfun$fold$1(RDD.scala:1183)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.fold(RDD.scala:1177)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$1(RDD.scala:1246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1222)
	at org.apache.spark.mllib.stat.Statistics$.colStats(Statistics.scala:58)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.summary$lzycompute(RegressionMetrics.scala:70)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.summary(RegressionMetrics.scala:62)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.SSerr$lzycompute(RegressionMetrics.scala:74)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.SSerr(RegressionMetrics.scala:74)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.meanSquaredError(RegressionMetrics.scala:106)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.rootMeanSquaredError(RegressionMetrics.scala:115)
	at org.apache.spark.ml.evaluation.RegressionEvaluator.evaluate(RegressionEvaluator.scala:101)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:750)


------

### Load the model

In [23]:
modelPath = f"/user/itv009301/model/rank_{rank}_lambda_{regParameter}_model"
model = ALSModel.load(modelPath)

----

#### Show Users' historical ratings

In [27]:
user = 958971
df_with_lowest_user_id = filtered_ratings_df.where(filtered_ratings_df.user_id==user).orderBy(["rating"], ascending=False)

In [28]:
metadata_df.alias("m").join(
    df_with_lowest_user_id.alias("r"),
    metadata_df["item_id"] == df_with_lowest_user_id["item_id"],
    "inner").collect()

[Row(item_id=1466, title='Donnie Brasco (1997)', item_id=1466, user_id=958971, rating=3.5),
 Row(item_id=2542, title='Lock, Stock & Two Smoking Barrels (1998)', item_id=2542, user_id=958971, rating=4.0),
 Row(item_id=2396, title='Shakespeare in Love (1998)', item_id=2396, user_id=958971, rating=3.5),
 Row(item_id=2006, title='Mask of Zorro, The (1998)', item_id=2006, user_id=958971, rating=3.5),
 Row(item_id=4246, title="Bridget Jones's Diary (2001)", item_id=4246, user_id=958971, rating=4.0),
 Row(item_id=1267, title='Manchurian Candidate, The (1962)', item_id=1267, user_id=958971, rating=4.0),
 Row(item_id=4995, title='Beautiful Mind, A (2001)', item_id=4995, user_id=958971, rating=4.0),
 Row(item_id=2770, title='Bowfinger (1999)', item_id=2770, user_id=958971, rating=3.5),
 Row(item_id=6539, title='Pirates of the Caribbean: The Curse of the Black Pearl (2003)', item_id=6539, user_id=958971, rating=4.0),
 Row(item_id=2395, title='Rushmore (1998)', item_id=2395, user_id=958971, rating

#### Get Recommendations for selected user_id

In [24]:
user_id = 0
n_of_recommendations = 30

subset_data = [(user_id,)]
subset_columns = ["user_id"]
subset_df = spark.createDataFrame(subset_data, subset_columns)


# Generate collaborative filtering recommendations
collaborative_recommendations = model.recommendForUserSubset(subset_df, n_of_recommendations)

# Explode the array of recommendations to individual rows
exploded_df = collaborative_recommendations.select("user_id", explode("recommendations").alias("recommendation"))

# Extract item_id and rating columns for the join
result_df = exploded_df.select(
    "user_id",
    "recommendation.item_id",
    "recommendation.rating"
)

# Join with metadata_df on 'item_id'
joined_df = result_df.join(metadata_df, on="item_id", how="inner")

# Show the final result
joined_df.orderBy(["rating"], ascending=False).show(30,truncate=False)

+-------+-------+---------+------------------------------------------------------------------------------+
|item_id|user_id|rating   |title                                                                         |
+-------+-------+---------+------------------------------------------------------------------------------+
|188925 |0      |6.7725663|8 Murders a Day (2011)                                                        |
|188923 |0      |6.095309 |49 Pulses (2017)                                                              |
|165421 |0      |5.501858 |The Red Pill (2016)                                                           |
|2609   |0      |4.996947 |King of Masks, The (Bian Lian) (1996)                                         |
|4993   |0      |4.991012 |Lord of the Rings: The Fellowship of the Ring, The (2001)                     |
|1198   |0      |4.9627814|Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)|
|7153   |0      |4.9344115|Lord of th

------

In [25]:
user_id = 552403
n_of_recommendations = 30

subset_data = [(user_id,)]
subset_columns = ["user_id"]
subset_df = spark.createDataFrame(subset_data, subset_columns)


# Generate collaborative filtering recommendations
collaborative_recommendations = model.recommendForUserSubset(subset_df, n_of_recommendations)

# Explode the array of recommendations to individual rows
exploded_df = collaborative_recommendations.select("user_id", explode("recommendations").alias("recommendation"))

# Extract item_id and rating columns for the join
result_df = exploded_df.select(
    "user_id",
    "recommendation.item_id",
    "recommendation.rating"
)

# Join with metadata_df on 'item_id'
joined_df = result_df.join(metadata_df, on="item_id", how="inner")

# Show the final result
joined_df.orderBy(["rating"], ascending=False).show(30,truncate=False)

+-------+-------+---------+---------------------------------------------------------------------------+
|item_id|user_id|rating   |title                                                                      |
+-------+-------+---------+---------------------------------------------------------------------------+
|188925 |552403 |5.992783 |8 Murders a Day (2011)                                                     |
|188923 |552403 |5.393504 |49 Pulses (2017)                                                           |
|136445 |552403 |4.890968 |George Carlin: Back in Town (1996)                                         |
|32649  |552403 |4.886404 |Special Day, A (Giornata particolare, Una) (1977)                          |
|26073  |552403 |4.8167443|Human Condition III, The (Ningen no joken III) (1961)                      |
|45412  |552403 |4.7861447|Hidden Blade, The (Kakushi ken oni no tsume) (2004)                        |
|750    |552403 |4.7584987|Dr. Strangelove or: How I Learned to 

In [26]:
user_id = 700445
n_of_recommendations = 30

subset_data = [(user_id,)]
subset_columns = ["user_id"]
subset_df = spark.createDataFrame(subset_data, subset_columns)


# Generate collaborative filtering recommendations
collaborative_recommendations = model.recommendForUserSubset(subset_df, n_of_recommendations)

# Explode the array of recommendations to individual rows
exploded_df = collaborative_recommendations.select("user_id", explode("recommendations").alias("recommendation"))

# Extract item_id and rating columns for the join
result_df = exploded_df.select(
    "user_id",
    "recommendation.item_id",
    "recommendation.rating"
)

# Join with metadata_df on 'item_id'
joined_df = result_df.join(metadata_df, on="item_id", how="inner")

# Show the final result
joined_df.orderBy(["rating"], ascending=False).show(30,truncate=False)

+-------+-------+---------+------------------------------------------------------------------------------+
|item_id|user_id|rating   |title                                                                         |
+-------+-------+---------+------------------------------------------------------------------------------+
|188925 |700445 |7.092567 |8 Murders a Day (2011)                                                        |
|188923 |700445 |6.3833094|49 Pulses (2017)                                                              |
|184299 |700445 |5.2092605|Freedom on My Mind (1994)                                                     |
|204304 |700445 |5.0758643|Leningrad: Kolshik (2017)                                                     |
|171495 |700445 |5.0580797|Cosmos                                                                        |
|166812 |700445 |5.0089564|Seeing Red: Stories of American Communists (1983)                             |
|202936 |700445 |4.9813156|ReMoved (2

---

## Note:
* It is recommeding same movies for every user, users are not new they were part of training, so there is some error with the code that needs to be fixed.
* After fixing the issue of overfitting, we are getting some movies rating >5.00, we will remove those recommendations. As they appear for every user and don't align with users' history.