In [1]:
#importing the required pyspark library
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
#Setup Spark Session
spark = SparkSession.builder.appName('Recommender').getOrCreate()
spark

In [2]:
#CSV file can be downloaded from the link mentioned above.
data = spark.read.csv('book_ratings.csv', inferSchema=True,header=True)
data.show(5)
data.describe().show()

+-------+-------+------+
|book_id|user_id|rating|
+-------+-------+------+
|      1|    314|     5|
|      1|    439|     3|
|      1|    588|     5|
|      1|   1169|     4|
|      1|   1185|     4|
+-------+-------+------+
only showing top 5 rows

+-------+-----------------+------------------+------------------+
|summary|          book_id|           user_id|            rating|
+-------+-----------------+------------------+------------------+
|  count|           981756|            981756|            981756|
|   mean|4943.275635697668|25616.759933221696|3.8565335989797873|
| stddev|2873.207414896197| 15228.33882588251|0.9839408559620116|
|    min|                1|                 1|                 1|
|    max|            10000|             53424|                 5|
+-------+-----------------+------------------+------------------+



In [3]:
# Dividing the data using random split into train_data and test_data
# in 80% and 20% respectively
train_data,test_data=data.randomSplit([0.8, 0.2])

In [4]:
# Build the recommendation model using ALS on the training data
als = ALS(maxIter=5,regParam=0.01,userCol="user_id",itemCol="book_id",ratingCol="rating")
#Fitting the model on the train_data
model=als.fit(train_data)

In [5]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test_data)
#Displaying predictions calculated by the model
predictions.show()

+-------+-------+------+----------+
|book_id|user_id|rating|prediction|
+-------+-------+------+----------+
|      1|   1169|     4| 3.7025928|
|      1|   3922|     5|  4.678804|
|      1|  10140|     4| 3.2282882|
|      1|  10146|     5|  4.014236|
|      1|  16377|     4|  4.925151|
|      1|  17984|     5| 4.1260147|
|      1|  21713|     5|  4.800112|
|      1|  23612|     4| 4.1313086|
|      1|  24845|     5| 4.0538073|
|      1|  25214|     4| 5.6126328|
|      1|  26145|     4| 3.7452974|
|      1|  29123|     3| 3.0765924|
|      1|  29703|     5| 3.8599162|
|      1|  30681|     5| 4.3942995|
|      1|  32305|     5| 3.4297605|
|      1|  33065|     4| 4.7023225|
|      1|  38082|     5|  5.106575|
|      1|  41074|     3| 3.8541207|
|      1|  42404|     5|  4.964979|
|      1|  44243|     2|  3.934299|
+-------+-------+------+----------+
only showing top 20 rows



In [6]:
#Printing and calculating RMSE
evaluator = RegressionEvaluator(metricName="rmse",
labelCol="rating",predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = nan


In [7]:
#Filtering user with user id "5461" with book id on which it has given the reviews
user1 = test_data.filter(test_data['user_id']==5461).select(['book_id','user_id'])
#Displaying user1 data
user1.show()

#Traning and evaluating for user1 with our model trained with the help of training data
recommendations = model.transform(user1)
#Displaying the predictions of books for user1
recommendations.orderBy('prediction',ascending=False).show()

+-------+-------+
|book_id|user_id|
+-------+-------+
|      9|   5461|
|     15|   5461|
|     16|   5461|
|     33|   5461|
|     35|   5461|
|     38|   5461|
|     82|   5461|
|    100|   5461|
|    111|   5461|
|    118|   5461|
|    148|   5461|
|    172|   5461|
|    181|   5461|
|    227|   5461|
|    233|   5461|
|    251|   5461|
|    264|   5461|
|    358|   5461|
|    395|   5461|
|    401|   5461|
+-------+-------+
only showing top 20 rows

+-------+-------+----------+
|book_id|user_id|prediction|
+-------+-------+----------+
|    100|   5461|  5.090668|
|   1644|   5461|  4.983655|
|    233|   5461|  4.830121|
|   3613|   5461| 4.7577944|
|     38|   5461| 4.6801224|
|     82|   5461| 4.5605154|
|    181|   5461|  4.490892|
|   3889|   5461| 4.4586506|
|     15|   5461| 4.4518332|
|    172|   5461|  4.445932|
|    844|   5461| 4.4169536|
|    358|   5461| 4.4014845|
|    118|   5461| 4.4006243|
|    863|   5461|  4.390333|
|    515|   5461|  4.382334|
|    489|   5461| 4.

In [8]:
spark.stop()