In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.\
        builder.\
        appName("book-recs").\
        master("spark://spark-master:7077").\
        config("spark.executor.memory", "512m").\
        getOrCreate()

23/11/08 20:51:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
users_df = spark.read.csv(path="data/Users.csv", header=True, inferSchema=True)
books_df = spark.read.csv(path="data/Books.csv", header=True, inferSchema=True)
ratings_df = spark.read.csv(path="data/Ratings.csv", header=True, inferSchema=True)

                                                                                

In [4]:
for col in users_df.columns:
    users_df = users_df.withColumnRenamed(col, col.replace('-', '_').lower())
    
for col in books_df.columns:
    books_df = books_df.withColumnRenamed(col, col.replace('-', '_').lower())

for col in ratings_df.columns:
    ratings_df = ratings_df.withColumnRenamed(col, col.replace('-', '_').lower())

In [5]:
dfs = [users_df, books_df, ratings_df]

In [6]:
for df in dfs:
    df.printSchema()
    print(f'Rows in DataFrame: {df.count()}')
    df.show(5)

root
 |-- user_id: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- age: string (nullable = true)

Rows in DataFrame: 278859
+-------+--------------------+----+
|user_id|            location| age|
+-------+--------------------+----+
|      1|  nyc, new york, usa|null|
|      2|stockton, califor...|18.0|
|      3|moscow, yukon ter...|null|
|      4|porto, v.n.gaia, ...|17.0|
|      5|farnborough, hant...|null|
+-------+--------------------+----+
only showing top 5 rows

root
 |-- isbn: string (nullable = true)
 |-- book_title: string (nullable = true)
 |-- book_author: string (nullable = true)
 |-- year_of_publication: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- image_url_s: string (nullable = true)
 |-- image_url_m: string (nullable = true)
 |-- image_url_l: string (nullable = true)

Rows in DataFrame: 271360
+----------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------

In [7]:
books_df = books_df.drop('image_url_s', 'image_url_m', 'image_url_l')

First let's check if there are duplicate values in the dataframes.

In [8]:
print(f'Dupliactes in users_df: {users_df.distinct().count() != users_df.count()}')
print(f'Dupliactes in books_df: {books_df.distinct().count() != books_df.count()}')
print(f'Dupliactes in ratings_df: {ratings_df.distinct().count() != ratings_df.count()}')

                                                                                

Dupliactes in users_df: False


                                                                                

Dupliactes in books_df: False


                                                                                

Dupliactes in ratings_df: False


Now let's count missing values.

In [9]:
from pyspark.sql.functions import when, count, col

for df in dfs:
    df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+-------+--------+------+
|user_id|location|   age|
+-------+--------+------+
|      1|       0|110518|
+-------+--------+------+



                                                                                

+----+----------+-----------+-------------------+---------+-----------+-----------+-----------+
|isbn|book_title|book_author|year_of_publication|publisher|image_url_s|image_url_m|image_url_l|
+----+----------+-----------+-------------------+---------+-----------+-----------+-----------+
|   0|         0|          1|                  0|        2|          0|          0|          3|
+----+----------+-----------+-------------------+---------+-----------+-----------+-----------+





+-------+----+-----------+
|user_id|isbn|book_rating|
+-------+----+-----------+
|      0|   0|          0|
+-------+----+-----------+



                                                                                

# Popularity-based recommender system users_df

In [10]:
df = ratings_df.join(users_df, on='user_id', how='left')
df = df.join(books_df, on='isbn', how='left')

In [11]:
class PopularityBasedRecSys:
    def __init__(self, n_recs=5):
        self.n_recs = n_recs
        self.recs = None
        
    def fit(self, df):
        self.recs = df.\
                    groupBy('book_title', 'book_author').agg(count('book_title').alias('popularity')).\
                    orderBy('popularity', ascending=False)
       
    def predict(self):
        return self.recs.limit(self.n_recs)

In [12]:
pop_recsys = PopularityBasedRecSys()
pop_recsys.fit(df)
pop_recsys.predict().show()

                                                                                

+--------------------+---------------+----------+
|          book_title|    book_author|popularity|
+--------------------+---------------+----------+
|         Wild Animus|   Rich Shapero|      2502|
|The Lovely Bones:...|   Alice Sebold|      1295|
|   The Da Vinci Code|      Dan Brown|       887|
|The Nanny Diaries...|Emma McLaughlin|       828|
|Bridget Jones's D...| Helen Fielding|       815|
+--------------------+---------------+----------+



In [13]:
# from pyspark.sql.functions import split, element_at

# users_df = users_df.withColumn('country', element_at(split(col('Location'), ','), -1))
# users_df.show(5)

In [14]:
class PopularityByCountryRecSys:
    pass