In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

In [2]:
!pip install numpy --quiet

[0m

In [3]:
spark = SparkSession\
        .builder\
        .appName("book-recs")\
        .master("spark://spark-master:7077")\
        .config("spark.executor.memory", "512m")\
        .getOrCreate()

23/11/11 18:33:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
users_schema = StructType([
    StructField('user_id', IntegerType(), True),
    StructField('location', StringType(), True), 
    StructField('age', FloatType(), True),
    StructField('_corrupt_record', StringType(), True)
])

users_df = spark.read.csv(
    path='data/Users.csv', 
    schema=users_schema,
    mode='PERMISSIVE',
    columnNameOfCorruptRecord='_corrupt_record',
    escape='"'
).cache()

In [5]:
corrupt_user_records = users_df.filter(~users_df._corrupt_record.isNull())
corrupt_user_records.show()

[Stage 1:>                                                          (0 + 1) / 1]

+-------+----------+----+--------------------+
|user_id|  location| age|     _corrupt_record|
+-------+----------+----+--------------------+
|   null|  Location|null|User-ID,Location,Age|
| 275081|cernusco s|null|  275081,"cernusco s|
|   null|     milan|null|    , milan, italy",|
+-------+----------+----+--------------------+



                                                                                

In [6]:
print(f'Number of corrupt records to drop: {corrupt_user_records.count()}')
users_df = users_df.filter(users_df._corrupt_record.isNull())
users_df = users_df.drop('_corrupt_record')
users_df.unpersist();

Number of corrupt records to drop: 3


In [7]:
books_schema = StructType([
    StructField('isbn', StringType(), True),
    StructField('book_title', StringType(), True), 
    StructField('book_author', StringType(), True),
    StructField('year_of_publication', IntegerType(), True),
    StructField('publisher', StringType(), True),
    StructField('image_url_s', StringType(), True),
    StructField('image_url_m', StringType(), True),
    StructField('image_url_l', StringType(), True),
    StructField('_corrupt_record', StringType(), True)
])

books_df = spark.read.csv(
    path='data/Books.csv', 
    schema=books_schema,
    mode='PERMISSIVE',
    columnNameOfCorruptRecord='_corrupt_record',
    escape='"',
).cache()

In [8]:
corrupt_book_records = books_df.filter(~books_df._corrupt_record.isNull())
corrupt_book_records.show()

[Stage 5:>                                                          (0 + 1) / 1]

+----------+--------------------+-----------+-------------------+--------------------+--------------------+--------------------+-----------+--------------------+
|      isbn|          book_title|book_author|year_of_publication|           publisher|         image_url_s|         image_url_m|image_url_l|     _corrupt_record|
+----------+--------------------+-----------+-------------------+--------------------+--------------------+--------------------+-----------+--------------------+
|      ISBN|          Book-Title|Book-Author|               null|           Publisher|         Image-URL-S|         Image-URL-M|Image-URL-L|ISBN,Book-Title,B...|
|078946697X|DK Readers: Creat...|       2000|               null|http://images.ama...|http://images.ama...|http://images.ama...|       null|078946697X,"DK Re...|
|2070426769|Peuple du ciel, s...|       2003|               null|http://images.ama...|http://images.ama...|http://images.ama...|       null|2070426769,"Peupl...|
|0789466953|DK Readers: Crea

                                                                                

In [9]:
print(f'Number of corrupt records to drop: {corrupt_book_records.count()}')
books_df = books_df.filter(books_df._corrupt_record.isNull())
books_df = books_df.drop('_corrupt_record')
books_df.unpersist();

Number of corrupt records to drop: 4


In [10]:
ratings_schema = StructType([
    StructField('user_id', IntegerType(), True),
    StructField('isbn', StringType(), True), 
    StructField('book_rating', IntegerType(), True),
    StructField('_corrupt_record', StringType(), True)
])

ratings_df = spark.read.csv(
    path='data/Ratings.csv', 
    schema=ratings_schema,
    mode='PERMISSIVE',
    columnNameOfCorruptRecord='_corrupt_record',
).cache()

In [11]:
corrupt_rating_records = ratings_df.filter(~ratings_df._corrupt_record.isNull())
corrupt_rating_records.show()

[Stage 9:>                                                          (0 + 1) / 1]

+-------+----+-----------+--------------------+
|user_id|isbn|book_rating|     _corrupt_record|
+-------+----+-----------+--------------------+
|   null|ISBN|       null|User-ID,ISBN,Book...|
+-------+----+-----------+--------------------+



                                                                                

In [12]:
print(f'Number of corrupt records to drop: {corrupt_rating_records.count()}')
ratings_df = ratings_df.filter(ratings_df._corrupt_record.isNull())
ratings_df = ratings_df.drop('_corrupt_record')
ratings_df.unpersist();

Number of corrupt records to drop: 1


First let's check if there are duplicate values in the dataframes.

In [13]:
print(f'Dupliactes in users_df: {users_df.distinct().count() != users_df.count()}')
print(f'Dupliactes in books_df: {books_df.distinct().count() != books_df.count()}')
print(f'Dupliactes in ratings_df: {ratings_df.distinct().count() != ratings_df.count()}')

                                                                                

Dupliactes in users_df: False


                                                                                

Dupliactes in books_df: False




Dupliactes in ratings_df: False


                                                                                

Now let's count missing values.

In [14]:
from pyspark.sql.functions import when, count, col

for df in users_df, books_df, ratings_df:
    df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+-------+--------+------+
|user_id|location|   age|
+-------+--------+------+
|      0|       0|110761|
+-------+--------+------+

+----+----------+-----------+-------------------+---------+-----------+-----------+-----------+
|isbn|book_title|book_author|year_of_publication|publisher|image_url_s|image_url_m|image_url_l|
+----+----------+-----------+-------------------+---------+-----------+-----------+-----------+
|   0|         0|          1|                  0|        2|          0|          0|          0|
+----+----------+-----------+-------------------+---------+-----------+-----------+-----------+

+-------+----+-----------+
|user_id|isbn|book_rating|
+-------+----+-----------+
|      0|   0|          0|
+-------+----+-----------+



In [15]:
books_df.filter(books_df['book_author'].isNull()).show()

+----------+--------------------+-----------+-------------------+--------------------+--------------------+--------------------+--------------------+
|      isbn|          book_title|book_author|year_of_publication|           publisher|         image_url_s|         image_url_m|         image_url_l|
+----------+--------------------+-----------+-------------------+--------------------+--------------------+--------------------+--------------------+
|9627982032|The Credit Suisse...|       null|               1995|Edinburgh Financi...|http://images.ama...|http://images.ama...|http://images.ama...|
+----------+--------------------+-----------+-------------------+--------------------+--------------------+--------------------+--------------------+



In [16]:
no_book_author_isbns = books_df.filter(books_df['book_author'].isNull()).select('isbn').collect()
ratings_df.filter(ratings_df.isbn.isin([row[0] for row in no_book_author_isbns])).count()

1

Book with book_author missing was rated once. 

In [17]:
books_df.filter(books_df['publisher'].isNull()).show()

+----------+---------------+---------------+-------------------+---------+--------------------+--------------------+--------------------+
|      isbn|     book_title|    book_author|year_of_publication|publisher|         image_url_s|         image_url_m|         image_url_l|
+----------+---------------+---------------+-------------------+---------+--------------------+--------------------+--------------------+
|193169656X|    Tyrant Moon|Elaine Corvidae|               2002|     null|http://images.ama...|http://images.ama...|http://images.ama...|
|1931696993|Finders Keepers|Linnea Sinclair|               2001|     null|http://images.ama...|http://images.ama...|http://images.ama...|
+----------+---------------+---------------+-------------------+---------+--------------------+--------------------+--------------------+



In [18]:
no_publisher_isbns = books_df.filter(books_df['publisher'].isNull()).select('isbn').collect()
ratings_df.filter(ratings_df.isbn.isin([row[0] for row in no_publisher_isbns])).count()

2

Books with publisher missing were rated a total of two times. 

In [19]:
books_df = books_df.na.fill('Unknown')

In [20]:
books_df = books_df.drop('image_url_s', 'image_url_m', 'image_url_l')

In [21]:
from pyspark.sql.functions import countDistinct

print('Distinct isbn values in books_df:')
books_df.agg(countDistinct(col("isbn"))).show()

print('Distinct isbn values in ratings_df:')
ratings_df.agg(countDistinct(col("isbn"))).show()

Distinct isbn values in books_df:


                                                                                

+-----------+
|count(isbn)|
+-----------+
|     271357|
+-----------+

Distinct isbn values in ratings_df:




+-----------+
|count(isbn)|
+-----------+
|     340556|
+-----------+



                                                                                

# Popularity-based recommender system

In [22]:
df = ratings_df.join(users_df, on='user_id', how='left')
df = df.join(books_df, on='isbn', how='left')

In [23]:
df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).show()

                                                                                

+----+-------+-----------+--------+------+----------+-----------+-------------------+---------+
|isbn|user_id|book_rating|location|   age|book_title|book_author|year_of_publication|publisher|
+----+-------+-----------+--------+------+----------+-----------+-------------------+---------+
|   0|      0|          0|       0|309492|    118648|     118648|             118648|   118648|
+----+-------+-----------+--------+------+----------+-----------+-------------------+---------+



In [24]:
class PopularityBasedRecSys:
    
    def __init__(self, n_recs=5):
        self.n_recs = n_recs
        self.recs = None
        
    def fit(self, df):
#         self.recs = df.\
#                     groupBy('isbn').agg(count('isbn').alias('popularity')).\
#                     orderBy('popularity', ascending=False)
        df.createOrReplaceTempView('data')
        self.recs = spark.sql('''SELECT COUNT(isbn) AS popularity, isbn, book_title, book_author
                                 FROM data
                                 GROUP BY isbn, book_title, book_author
                                 ORDER BY COUNT(isbn) DESC''')
       
    def predict(self):
        return self.recs.limit(self.n_recs)


pop_recsys = PopularityBasedRecSys(n_recs=10)
pop_recsys.fit(df)
book_recs = pop_recsys.predict()
book_recs.show()



+----------+----------+--------------------+--------------+
|popularity|      isbn|          book_title|   book_author|
+----------+----------+--------------------+--------------+
|      2502|0971880107|         Wild Animus|  Rich Shapero|
|      1295|0316666343|The Lovely Bones:...|  Alice Sebold|
|       883|0385504209|   The Da Vinci Code|     Dan Brown|
|       732|0060928336|Divine Secrets of...| Rebecca Wells|
|       723|0312195516|The Red Tent (Bes...| Anita Diamant|
|       647|044023722X|     A Painted House|  John Grisham|
|       639|0679781587|                null|          null|
|       615|0142001740|The Secret Life o...| Sue Monk Kidd|
|       614|067976402X|Snow Falling on C...|David Guterson|
|       586|0671027360| Angels &amp; Demons|     Dan Brown|
+----------+----------+--------------------+--------------+



                                                                                

In [25]:
books_df.filter(books_df.isbn == '0679781587').show()

+----+----------+-----------+-------------------+---------+
|isbn|book_title|book_author|year_of_publication|publisher|
+----+----------+-----------+-------------------+---------+
+----+----------+-----------+-------------------+---------+



Since 0679781587 isbn is not present in books_df, book_title and book_author will not be present in the resulting data frame of recommendations.

In [26]:
class HighestRatedPopularityBasedRecSys(PopularityBasedRecSys):
    
    def __init__(self, min_num_ratings=100, **kwargs):
        super().__init__(**kwargs)
        self.min_num_ratings = min_num_ratings
        
    def fit(self, df):
        df.createOrReplaceTempView('data')
        self.recs = spark.sql(f'''SELECT isbn, ROUND(AVG(book_rating), 2) AS popularity, book_title, book_author
                                  FROM data
                                  GROUP BY isbn, book_title, book_author
                                  HAVING COUNT(isbn) > {self.min_num_ratings}
                                  ORDER BY popularity DESC''')
        
        
highest_rated_pop_recsys = HighestRatedPopularityBasedRecSys(n_recs=15)
highest_rated_pop_recsys.fit(df)
highest_rated_book_recs = highest_rated_pop_recsys.predict()
highest_rated_book_recs.show()



+----------+----------+--------------------+----------------+
|      isbn|popularity|          book_title|     book_author|
+----------+----------+--------------------+----------------+
|0439064864|      6.61|Harry Potter and ...|   J. K. Rowling|
|0439139597|      6.54|Harry Potter and ...|   J. K. Rowling|
|0439136350|      6.47|Harry Potter and ...|   J. K. Rowling|
|0590353403|      6.36|Harry Potter and ...|   J. K. Rowling|
|043935806X|      5.57|Harry Potter and ...|   J. K. Rowling|
|0439136369|      5.35|Harry Potter and ...|   J. K. Rowling|
|0812550706|       5.3|Ender's Game (End...|Orson Scott Card|
|0671027344|      5.19|The Perks of Bein...| Stephen Chbosky|
|0439139600|       5.1|Harry Potter and ...|   J. K. Rowling|
|0345339681|      5.01|The Hobbit : The ...|  J.R.R. TOLKIEN|
|0446310786|      4.92|To Kill a Mocking...|      Harper Lee|
|0440219078|      4.92|The Giver (21st C...|      LOIS LOWRY|
|0553375407|      4.91|Ishmael: An Adven...|    Daniel Quinn|
|0590353

                                                                                

# Model-based collaborative filtering

In [27]:
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol='isbn', outputCol='isbn_indexed')
ratings_df = indexer.fit(ratings_df).transform(ratings_df)

                                                                                

In [28]:
ratings_df.show(5)

23/11/11 18:35:02 WARN DAGScheduler: Broadcasting large task binary with size 10.1 MiB


+-------+----------+-----------+------------+
|user_id|      isbn|book_rating|isbn_indexed|
+-------+----------+-----------+------------+
| 276725|034545104X|          0|      1637.0|
| 276726|0155061224|          5|     89066.0|
| 276727|0446520802|          0|       568.0|
| 276729|052165615X|          3|    205975.0|
| 276729|0521795028|          6|    206005.0|
+-------+----------+-----------+------------+
only showing top 5 rows



                                                                                

In [None]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import RegressionEvaluator

train, test = ratings_df.randomSplit([0.8, 0.2])

als = ALS(
    userCol='user_id',
    itemCol='isbn_indexed',
    ratingCol='book_rating',
    nonnegative=True,
    coldStartStrategy='drop'
)

evaluator = RegressionEvaluator(labelCol='book_rating')

params = ParamGridBuilder()\
         .addGrid(als.rank, [10, 20,])\
         .addGrid(als.maxIter, [10, 15, 20])\
         .addGrid(als.regParam, [0.01, 0.1, 0.5])\
         .build()
         
tvs = TrainValidationSplit(
    estimator=als,
    estimatorParamMaps=params,
    evaluator=evaluator
)

model = tvs.fit(train)
preds = model.transform(test)
rmse = evaluator.evaluate(preds)