###  Book Recommendation with Pyspark (Collaborative recocmmendation system) 

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import StructType,StructField,IntegerType,StringType,LongType,DoubleType
from pyspark.sql.functions import col
import pandas as pd
from pyspark.sql.functions import isnan, when, count, col
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


In [2]:
spark = SparkSession.builder.config("spark.executor.heartbeatInterval", "10000s").appName('Recommender').getOrCreate()
spark

In [3]:
sc = spark.sparkContext

In [4]:
books=pd.read_csv('BX-Books.csv',sep =';', on_bad_lines='skip', encoding="iso-8859-1")
users=pd.read_csv('BX-users.csv',sep =';', on_bad_lines='skip', encoding="iso-8859-1")
ratings=pd.read_csv('BX-Book-Ratings.csv',sep =';', on_bad_lines='skip', encoding="iso-8859-1")

In [5]:
ratings_sdf = spark.createDataFrame(ratings) 

In [6]:
ratings_sdf.printSchema()

root
 |-- User-ID: long (nullable = true)
 |-- ISBN: string (nullable = true)
 |-- Book-Rating: long (nullable = true)



In [7]:
ratings_sdf.show()

+-------+----------+-----------+
|User-ID|      ISBN|Book-Rating|
+-------+----------+-----------+
| 276725|034545104X|          0|
| 276726|0155061224|          5|
| 276727|0446520802|          0|
| 276729|052165615X|          3|
| 276729|0521795028|          6|
| 276733|2080674722|          0|
| 276736|3257224281|          8|
| 276737|0600570967|          6|
| 276744|038550120X|          7|
| 276745| 342310538|         10|
| 276746|0425115801|          0|
| 276746|0449006522|          0|
| 276746|0553561618|          0|
| 276746|055356451X|          0|
| 276746|0786013990|          0|
| 276746|0786014512|          0|
| 276747|0060517794|          9|
| 276747|0451192001|          0|
| 276747|0609801279|          0|
| 276747|0671537458|          9|
+-------+----------+-----------+
only showing top 20 rows



In [8]:
user_sdf=spark.createDataFrame(users)

In [9]:
user_sdf.show()

+-------+--------------------+----+
|User-ID|            Location| Age|
+-------+--------------------+----+
|      1|  nyc, new york, usa| NaN|
|      2|stockton, califor...|18.0|
|      3|moscow, yukon ter...| NaN|
|      4|porto, v.n.gaia, ...|17.0|
|      5|farnborough, hant...| NaN|
|      6|santa monica, cal...|61.0|
|      7| washington, dc, usa| NaN|
|      8|timmins, ontario,...| NaN|
|      9|germantown, tenne...| NaN|
|     10|albacete, wiscons...|26.0|
|     11|melbourne, victor...|14.0|
|     12|fort bragg, calif...| NaN|
|     13|barcelona, barcel...|26.0|
|     14|mediapolis, iowa,...| NaN|
|     15|calgary, alberta,...| NaN|
|     16|albuquerque, new ...| NaN|
|     17|chesapeake, virgi...| NaN|
|     18|rio de janeiro, r...|25.0|
|     19|           weston, ,|14.0|
|     20|langhorne, pennsy...|19.0|
+-------+--------------------+----+
only showing top 20 rows



In [10]:
books_sdf= spark.createDataFrame(books)

In [11]:
ratings_sdf.printSchema()

root
 |-- User-ID: long (nullable = true)
 |-- ISBN: string (nullable = true)
 |-- Book-Rating: long (nullable = true)



In [12]:
books_sdf =books_sdf.drop("Image-URL-S", "Image-URL-M", "Image-URL-L")

In [13]:
books_sdf.show()

+----------+--------------------+--------------------+-------------------+--------------------+
|      ISBN|          Book-Title|         Book-Author|Year-Of-Publication|           Publisher|
+----------+--------------------+--------------------+-------------------+--------------------+
| 195153448| Classical Mythology|  Mark P. O. Morford|               2002|Oxford University...|
|   2005018|        Clara Callan|Richard Bruce Wright|               2001|HarperFlamingo Ca...|
|  60973129|Decision in Normandy|        Carlo D'Este|               1991|     HarperPerennial|
| 374157065|Flu: The Story of...|    Gina Bari Kolata|               1999|Farrar Straus Giroux|
| 393045218|The Mummies of Ur...|     E. J. W. Barber|               1999|W. W. Norton &amp...|
| 399135782|The Kitchen God's...|             Amy Tan|               1991|    Putnam Pub Group|
| 425176428|What If?: The Wor...|       Robert Cowley|               2000|Berkley Publishin...|
| 671870432|     PLEADING GUILTY|       

In [14]:
columns=['ISBN']
indexer = [StringIndexer(inputCol=column, outputCol=column+"Index") for column in columns]
pipeline = Pipeline(stages=indexer)
transformed = pipeline.fit(books_sdf).transform(books_sdf)
transformed.show()

+----------+--------------------+--------------------+-------------------+--------------------+---------+
|      ISBN|          Book-Title|         Book-Author|Year-Of-Publication|           Publisher|ISBNIndex|
+----------+--------------------+--------------------+-------------------+--------------------+---------+
| 195153448| Classical Mythology|  Mark P. O. Morford|               2002|Oxford University...|  57260.0|
|   2005018|        Clara Callan|Richard Bruce Wright|               2001|HarperFlamingo Ca...|  57638.0|
|  60973129|Decision in Normandy|        Carlo D'Este|               1991|     HarperPerennial| 177769.0|
| 374157065|Flu: The Story of...|    Gina Bari Kolata|               1999|Farrar Straus Giroux| 107195.0|
| 393045218|The Mummies of Ur...|     E. J. W. Barber|               1999|W. W. Norton &amp...| 119104.0|
| 399135782|The Kitchen God's...|             Amy Tan|               1991|    Putnam Pub Group| 124682.0|
| 425176428|What If?: The Wor...|       Robert

In [15]:
ratings_sdf_processed = ratings_sdf.join(transformed,"ISBN","inner")

In [16]:
ratings_sdf_processed.show()

+----------+-------+-----------+--------------------+----------------+-------------------+--------------------+---------+
|      ISBN|User-ID|Book-Rating|          Book-Title|     Book-Author|Year-Of-Publication|           Publisher|ISBNIndex|
+----------+-------+-----------+--------------------+----------------+-------------------+--------------------+---------+
|000220083X| 122881|          0|AMPHIBIANS AND RE...|   Trevor Beebee|               2000|    Trafalgar Square|     14.0|
|000225669x| 256247|          0|One Thousand Ches...|      Mira Stout|                  0|Harpercollins Pub...|     28.0|
|000225946X| 100782|          9|             Swimmer|     Bill Broady|               2000|            Flamingo|     31.0|
|000225946X| 145161|          0|             Swimmer|     Bill Broady|               2000|            Flamingo|     31.0|
|000255710X| 182987|          0|The Danakil Diary...|Wilfred Thesiger|               1996|Harpercollins Canada|     38.0|
|000470973X|  37644|    

In [17]:
ratings_sdf_processed.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in ratings_sdf_processed.columns]
   ).show()

+----+-------+-----------+----------+-----------+-------------------+---------+---------+
|ISBN|User-ID|Book-Rating|Book-Title|Book-Author|Year-Of-Publication|Publisher|ISBNIndex|
+----+-------+-----------+----------+-----------+-------------------+---------+---------+
|   0|      0|          0|         0|          2|                  0|        2|        0|
+----+-------+-----------+----------+-----------+-------------------+---------+---------+



In [18]:
ratings_sdf_selected = ratings_sdf_processed.select(col("User-ID"), col("ISBNIndex"), col("Book-Rating"))

In [19]:
(training, test) = ratings_sdf_selected.randomSplit([0.8, 0.2])

In [22]:
als = ALS(userCol="User-ID", itemCol="ISBNIndex", ratingCol="Book-Rating", coldStartStrategy="drop", nonnegative = True,
implicitPrefs = False)
# model = als.fit(training)

In [23]:

params = ParamGridBuilder().addGrid(als.regParam, [.1, .15]).addGrid(als.rank, [10, 150]).build()

In [24]:
evaluator=RegressionEvaluator(metricName="rmse",labelCol="Book-Rating",predictionCol="prediction")

In [25]:
cv = CrossValidator(estimator = als, estimatorParamMaps = params, evaluator = evaluator, numFolds = 5)

In [26]:
model = cv.fit(training)

In [27]:
best_model = model.bestModel

In [28]:
predictions = best_model.transform(test)
rmse = evaluator.evaluate(predictions)

In [29]:
print ("**Best Model**")
print (f"RMSE ={rmse} ")
print (f"Rank:{best_model.rank}") 
print (f" MaxIter:{best_model._java_obj.parent().getMaxIter()}")
print (f" RegParam:{best_model._java_obj.parent().getRegParam()}")


**Best Model**
RMSE =4.072090555089141 
Rank:150
 MaxIter:10
 RegParam:0.15


In [20]:
als = ALS(maxIter=5, regParam=0.15,rank=150, userCol="User-ID", itemCol="ISBNIndex", ratingCol="Book-Rating", coldStartStrategy="drop", nonnegative = True,
implicitPrefs = False)
model = als.fit(training)

In [21]:
evaluator=RegressionEvaluator(metricName="rmse",labelCol="Book-Rating",predictionCol="prediction")
predictions=model.transform(test).na.drop()
rmse=evaluator.evaluate(predictions)
print("RMSE="+str(rmse))
predictions.show()

RMSE=4.033819099536668
+-------+---------+-----------+----------+
|User-ID|ISBNIndex|Book-Rating|prediction|
+-------+---------+-----------+----------+
| 104665|     78.0|          0|       0.0|
|  28993|     81.0|          9| 1.4908936|
| 244995|    115.0|          4|0.26952899|
|  64316|    115.0|         10| 2.0175445|
| 142143|    115.0|          0| 0.8274771|
| 252222|    271.0|          0| 1.2461506|
| 175003|    296.0|          0| 0.5703545|
| 209756|    321.0|          0| 1.5183747|
| 148863|    321.0|          7| 2.5576854|
|  54622|    406.0|          5| 1.5856388|
| 110912|    406.0|          0| 1.1166079|
| 265313|    406.0|          7| 1.2144725|
| 197659|    406.0|          9| 2.5215569|
| 218608|    412.0|         10| 4.3565707|
| 104569|    436.0|          5| 1.3040322|
|  70594|    436.0|          8| 2.3849626|
|  96665|    436.0|          0|       0.0|
|  11676|    436.0|          8| 2.0258257|
|  33656|    436.0|          5|       0.0|
|  63714|    516.0|          0|

In [22]:
user_recs=model.recommendForAllUsers(20)

In [23]:
user_recs.show()

+-------+--------------------+
|User-ID|     recommendations|
+-------+--------------------+
|     44|[{59238, 0.0}, {5...|
|    183|[{239158, 9.02123...|
|    243|[{51436, 5.364575...|
|    300|[{47341, 7.423259...|
|    362|[{59238, 0.0}, {5...|
|    384|[{59238, 0.0}, {5...|
|    392|[{241533, 9.06003...|
|    406|[{59238, 0.0}, {5...|
|    460|[{256778, 7.84945...|
|    472|[{14301, 8.95139}...|
|    496|[{12247, 8.373371...|
|    626|[{82730, 6.665379...|
|    744|[{244181, 8.63125...|
|    811|[{91938, 7.959905...|
|    853|[{3731, 8.850786}...|
|    876|[{59238, 0.0}, {5...|
|    914|[{82093, 9.905464...|
|    973|[{40414, 9.952799...|
|   1025|[{83288, 11.28329...|
|   1294|[{28463, 8.348033...|
+-------+--------------------+
only showing top 20 rows



In [24]:
rec_list=user_recs.filter(col('User-ID')==44).select("recommendations").collect()

In [25]:
topBooks = []
for item in rec_list[0][0]:
    topBooks.append(item.ISBNIndex)

In [26]:
topBooks

[59238,
 59218,
 59208,
 59198,
 59188,
 59178,
 59168,
 59165,
 59158,
 59155,
 59148,
 59145,
 59138,
 59135,
 59128,
 59125,
 59118,
 59115,
 59105,
 59098]

In [27]:
schema = StructType([StructField("ISBN",IntegerType(),True)])
Recommended_book = spark.createDataFrame(topBooks, IntegerType())
Recommended_book = Recommended_book.join(transformed, Recommended_book.value == transformed.ISBNIndex).select(transformed["Book-Title"], transformed["Book-Author"])

In [28]:
Recommended_book.show()

+--------------------+--------------------+
|          Book-Title|         Book-Author|
+--------------------+--------------------+
|La Guerre Des Bou...|             Pergaud|
| Les racines du ciel|         Romain Gary|
|Histoires Extraor...|     Edgar Allan Poe|
|             Lespoir|       Andre Malraux|
|LA Peste (Folio S...|        Albert Camus|
|      Sauve qui peut|            SempÃ?Â©|
|L'Education Europ...|         Romain Gary|
|L'espion qui vena...|    John Le CarrÃ?Â©|
|Ennemonde et autr...|          Jean Giono|
|Le Bon Gros Geant...|          Roald Dahl|
|L' Education Sent...|    Gustave Flaubert|
|Lettres De Mon Mo...|              Daudet|
|            La pomme|Pascale de Bourgoing|
|Noces Suivi De L'...|        Albert Camus|
|   Illusions Perdues|    Honore de Balzac|
|Au-dessous du volcan|             Lowry M|
|            L'Ã?Â®le|        Robert Merle|
|La Chartreuse De ...|            Stendhal|
|Un roi sans diver...|          Jean Giono|
|Zazie Dans Le Met...|     Raymo

Reference

https://thecleverprogrammer.com/2020/05/23/book-recommendation-system-with-machine-learning/

https://medium.com/@patelneha1495/recommendation-system-in-python-using-als-algorithm-and-apache-spark-27aca08eaab3